From 35e396cd100489dfe8f5a76e3613fb8049ffdff3 Mon Sep 17 00:00:00 2001
From: "xiphmont@xiph.org" <xiphmont@xiph.org>
Date: Fri, 22 Aug 2008 11:12:21 +0200
Subject: [PATCH 001/132] SG_IO block filter whitelist missing MMC SET READ
 AHEAD command

I have another request for the block filter SG_IO command whitelist,
specifically the MMC streaming command set SET READ AHEAD command.
The command applies only to MMC CDROM/DVDROM drives with the streaming
optional feature set.  The command is useful to cdparanoia in that it
allows explicit cache control side effects that are, on many drives,
cdparanoia's most efficient way to flush/disable the media cache on
cdrom drives. I am aware of no reason why it should not be accessible
from userspace.

Also note that the command is already fully accessible through the
SCSI-native version of the SG_IO ioctl as well as the traditional SG
interface.  The command is only being refused on block devices.  That
means that on a typical stock distro, the command is available through
/dev/sg* but not /dev/scd* although both are typically available and
accessible.  Filtering the command is not providing any protection,
only a confusing inconsistency.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/scsi_ioctl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index ec4b7f234626..3aab80a4c484 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -185,6 +185,7 @@ void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
 	__set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
 	__set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
 	__set_bit(GPCMD_SET_STREAMING, filter->write_ok);
+	__set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
 }
 EXPORT_SYMBOL_GPL(blk_set_cmd_filter_defaults);
 

From 7a67f63b3233ff28e753854fe27891c44f8588ae Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 8 Aug 2008 11:17:12 +0200
Subject: [PATCH 002/132] block: add bio_has_data() to detect whether a bio
 carries data or not

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0933a14e6414..9e93c9299479 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -445,6 +445,14 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
 	__bio_kmap_irq((bio), (bio)->bi_idx, (flags))
 #define bio_kunmap_irq(buf,flags)	__bio_kunmap_irq(buf, flags)
 
+/*
+ * Check whether this bio carries any data or not. A NULL bio is allowed.
+ */
+static inline int bio_has_data(struct bio *bio)
+{
+	return bio && bio->bi_io_vec != NULL;
+}
+
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
 #define bip_vec_idx(bip, idx)	(&(bip->bip_vec[(idx)]))

From a9c701e594669dd49fed448c27c64f20cfacc8a7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 8 Aug 2008 11:04:44 +0200
Subject: [PATCH 003/132] block: use bio_has_data() to check for data carrying
 bio

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c    | 5 +----
 include/linux/bio.h | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2cba5ef97b2b..54e442ba44aa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1490,10 +1490,7 @@ void submit_bio(int rw, struct bio *bio)
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
 	 */
-	if (!bio_empty_barrier(bio)) {
-
-		BIO_BUG_ON(!bio->bi_size);
-		BIO_BUG_ON(!bio->bi_io_vec);
+	if (bio_has_data(bio)) {
 
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9e93c9299479..dbeb66f813ab 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -185,7 +185,7 @@ struct bio {
 #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
-#define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
+#define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio))
 
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {

From 051cc3952a8fb6fa875a4eff68d06cf42207dcf4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 8 Aug 2008 11:06:45 +0200
Subject: [PATCH 004/132] block: use bio_has_data() in the IO completion path

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 54e442ba44aa..b5776c1fd52a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1885,7 +1885,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
+	if (bio_has_data(rq->bio)) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 
@@ -1943,10 +1943,9 @@ EXPORT_SYMBOL_GPL(blk_end_request);
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	if (blk_fs_request(rq) || blk_pc_request(rq)) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
-	}
+	if (bio_has_data(rq->bio) &&
+	    __end_that_request_first(rq, error, nr_bytes))
+		return 1;
 
 	add_disk_randomness(rq->rq_disk);
 

From 36144077bce9f89763ce994bc631cbd1c9db7785 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 14 Aug 2008 13:12:15 +0200
Subject: [PATCH 005/132] highmem: use bio_has_data() in the bounce path

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 mm/bounce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019b..06722c403058 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	/*
 	 * Data-less bio, nothing to bounce
 	 */
-	if (bio_empty_barrier(*bio_orig))
+	if (!bio_has_data(*bio_orig))
 		return;
 
 	/*

From d628eaef310533767ce68664873869c2d7f78f09 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sat, 9 Aug 2008 16:22:17 +0100
Subject: [PATCH 006/132] Fix up comments about matching flags between bio and
 rq

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 7 ++-----
 include/linux/bio.h    | 4 ++--
 include/linux/blkdev.h | 2 +-
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b5776c1fd52a..a496727df7ef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -624,10 +624,6 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
 
 	blk_rq_init(q, rq);
 
-	/*
-	 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
-	 * see bio.h and blkdev.h
-	 */
 	rq->cmd_flags = rw | REQ_ALLOCED;
 
 	if (priv) {
@@ -2012,7 +2008,8 @@ EXPORT_SYMBOL_GPL(blk_end_request_callback);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
-	/* first two bits are identical in rq->cmd_flags and bio->bi_rw */
+	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
+	   we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
 	rq->cmd_flags |= (bio->bi_rw & 3);
 
 	rq->nr_phys_segments = bio_phys_segments(q, bio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index dbeb66f813ab..17f1fbdb31bf 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -150,8 +150,8 @@ struct bio {
  * bit 3 -- fail fast, don't want low level driver retries
  * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
  */
-#define BIO_RW		0
-#define BIO_RW_AHEAD	1
+#define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
+#define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 53ea933cf60b..e0ba018f5e88 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -84,7 +84,7 @@ enum {
 };
 
 /*
- * request type modified bits. first three bits match BIO_RW* bits, important
+ * request type modified bits. first two bits match BIO_RW* bits, important
  */
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */

From fb2dce862d9f9a68e6b9374579056ec9eca02a63 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 5 Aug 2008 18:01:53 +0100
Subject: [PATCH 007/132] Add 'discard' request handling

Some block devices benefit from a hint that they can forget the contents
of certain sectors. Add basic support for this to the block core, along
with a 'blkdev_issue_discard()' helper function which issues such
requests.

The caller doesn't get to provide an end_io function, since
blkdev_issue_discard() will automatically split the request up into
multiple bios if appropriate. Neither does the function wait for
completion -- it's expected that callers won't care about when, or even
_if_, the request completes. It's only a hint to the device anyway. By
definition, the file system doesn't _care_ about these sectors any more.

[With feedback from OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> and
Jens Axboe <jens.axboe@oracle.com>]

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 69 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-core.c       | 28 ++++++++++++-----
 block/blk-settings.c   | 17 +++++++++++
 include/linux/bio.h    |  8 +++--
 include/linux/blkdev.h | 16 ++++++++++
 include/linux/fs.h     |  3 +-
 6 files changed, 130 insertions(+), 11 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index a09ead19f9c5..273121c0eb80 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -315,3 +315,72 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
+
+static void blkdev_discard_end_io(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+
+	bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question. Does not wait.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+			 unsigned nr_sects)
+{
+	struct request_queue *q;
+	struct bio *bio;
+	int ret = 0;
+
+	if (bdev->bd_disk == NULL)
+		return -ENXIO;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	if (!q->prepare_discard_fn)
+		return -EOPNOTSUPP;
+
+	while (nr_sects && !ret) {
+		bio = bio_alloc(GFP_KERNEL, 0);
+		if (!bio)
+			return -ENOMEM;
+
+		bio->bi_end_io = blkdev_discard_end_io;
+		bio->bi_bdev = bdev;
+
+		bio->bi_sector = sector;
+
+		if (nr_sects > q->max_hw_sectors) {
+			bio->bi_size = q->max_hw_sectors << 9;
+			nr_sects -= q->max_hw_sectors;
+			sector += q->max_hw_sectors;
+		} else {
+			bio->bi_size = nr_sects << 9;
+			nr_sects = 0;
+		}
+		bio_get(bio);
+		submit_bio(WRITE_DISCARD, bio);
+
+		/* Check if it failed immediately */
+		if (bio_flagged(bio, BIO_EOPNOTSUPP))
+			ret = -EOPNOTSUPP;
+		else if (!bio_flagged(bio, BIO_UPTODATE))
+			ret = -EIO;
+		bio_put(bio);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
diff --git a/block/blk-core.c b/block/blk-core.c
index a496727df7ef..1e143c4f9d34 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1079,6 +1079,10 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	 */
 	if (unlikely(bio_barrier(bio)))
 		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+	if (unlikely(bio_discard(bio))) {
+		req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD);
+		req->q->prepare_discard_fn(req->q, req);
+	}
 
 	if (bio_sync(bio))
 		req->cmd_flags |= REQ_RW_SYNC;
@@ -1095,7 +1099,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, nr_sectors, barrier, err;
+	int el_ret, nr_sectors, barrier, discard, err;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	int rw_flags;
@@ -1115,6 +1119,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		goto end_io;
 	}
 
+	discard = bio_discard(bio);
+	if (unlikely(discard) && !q->prepare_discard_fn) {
+		err = -EOPNOTSUPP;
+		goto end_io;
+	}
+
 	spin_lock_irq(q->queue_lock);
 
 	if (unlikely(barrier) || elv_queue_empty(q))
@@ -1405,7 +1415,8 @@ end_io:
 
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
-		if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
+		if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
+		    (bio_discard(bio) && !q->prepare_discard_fn)) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
@@ -1487,7 +1498,6 @@ void submit_bio(int rw, struct bio *bio)
 	 * go through the normal accounting stuff before submission.
 	 */
 	if (bio_has_data(bio)) {
-
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
 		} else {
@@ -1881,7 +1891,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (bio_has_data(rq->bio)) {
+	if (bio_has_data(rq->bio) || blk_discard_rq(rq)) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 
@@ -1939,7 +1949,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	if (bio_has_data(rq->bio) &&
+	if ((bio_has_data(rq->bio) || blk_discard_rq(rq)) &&
 	    __end_that_request_first(rq, error, nr_bytes))
 		return 1;
 
@@ -2012,12 +2022,14 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 	   we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
 	rq->cmd_flags |= (bio->bi_rw & 3);
 
-	rq->nr_phys_segments = bio_phys_segments(q, bio);
-	rq->nr_hw_segments = bio_hw_segments(q, bio);
+	if (bio_has_data(bio)) {
+		rq->nr_phys_segments = bio_phys_segments(q, bio);
+		rq->nr_hw_segments = bio_hw_segments(q, bio);
+		rq->buffer = bio_data(bio);
+	}
 	rq->current_nr_sectors = bio_cur_sectors(bio);
 	rq->hard_cur_sectors = rq->current_nr_sectors;
 	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
-	rq->buffer = bio_data(bio);
 	rq->data_len = bio->bi_size;
 
 	rq->bio = rq->biotail = bio;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index dfc77012843f..539d873c820d 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -32,6 +32,23 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 }
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
+/**
+ * blk_queue_set_discard - set a discard_sectors function for queue
+ * @q:		queue
+ * @dfn:	prepare_discard function
+ *
+ * It's possible for a queue to register a discard callback which is used
+ * to transform a discard request into the appropriate type for the
+ * hardware. If none is registered, then discard requests are failed
+ * with %EOPNOTSUPP.
+ *
+ */
+void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
+{
+	q->prepare_discard_fn = dfn;
+}
+EXPORT_SYMBOL(blk_queue_set_discard);
+
 /**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:		queue
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 17f1fbdb31bf..1fdfc5621c83 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -149,6 +149,8 @@ struct bio {
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
  * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
+ * bit 5 -- metadata request
+ * bit 6 -- discard sectors
  */
 #define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
 #define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */
@@ -156,6 +158,7 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
 #define BIO_RW_META	5
+#define BIO_RW_DISCARD	6
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -186,13 +189,14 @@ struct bio {
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio))
+#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
 	if (bio->bi_vcnt)
 		return bio_iovec(bio)->bv_len >> 9;
-
-	return 0;
+	else /* dataless requests such as discard */
+		return bio->bi_size >> 9;
 }
 
 static inline void *bio_data(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e0ba018f5e88..26ececbbebe2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -89,6 +89,7 @@ enum {
 enum rq_flag_bits {
 	__REQ_RW,		/* not set, read. set, write */
 	__REQ_FAILFAST,		/* no low level driver retries */
+	__REQ_DISCARD,		/* request to discard sectors */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
 	__REQ_HARDBARRIER,	/* may not be passed by drive either */
@@ -111,6 +112,7 @@ enum rq_flag_bits {
 };
 
 #define REQ_RW		(1 << __REQ_RW)
+#define REQ_DISCARD	(1 << __REQ_DISCARD)
 #define REQ_FAILFAST	(1 << __REQ_FAILFAST)
 #define REQ_SORTED	(1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
@@ -252,6 +254,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unplug_fn) (struct request_queue *);
+typedef int (prepare_discard_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
 struct bvec_merge_data {
@@ -307,6 +310,7 @@ struct request_queue
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unplug_fn		*unplug_fn;
+	prepare_discard_fn	*prepare_discard_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	prepare_flush_fn	*prepare_flush_fn;
 	softirq_done_fn		*softirq_done_fn;
@@ -546,6 +550,7 @@ enum {
 #define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
 #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
+#define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
@@ -796,6 +801,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
+extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
 extern int blk_do_ordered(struct request_queue *, struct request **);
@@ -837,6 +843,16 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t sector,
+				unsigned nr_sects);
+
+static inline int sb_issue_discard(struct super_block *sb,
+				   sector_t block, unsigned nr_blocks)
+{
+	block <<= (sb->s_blocksize_bits - 9);
+	nr_blocks <<= (sb->s_blocksize_bits - 9);
+	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks);
+}
 
 /*
 * command filter functions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 580b513668fe..eb0131319134 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -86,7 +86,8 @@ extern int dir_notify_enable;
 #define READ_META	(READ | (1 << BIO_RW_META))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
 #define SWRITE_SYNC	(SWRITE | (1 << BIO_RW_SYNC))
-#define WRITE_BARRIER	((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
+#define WRITE_BARRIER	(WRITE | (1 << BIO_RW_BARRIER))
+#define WRITE_DISCARD	(WRITE | (1 << BIO_RW_DISCARD))
 
 #define SEL_IN		1
 #define SEL_OUT		2

From 8c540a96c175bdf55bda8707db04cec78b816454 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 5 Aug 2008 18:05:46 +0100
Subject: [PATCH 008/132] Let the block device know when sectors can be
 discarded

[hirofumi@mail.parknet.co.jp: discard _after_ checking for corrupt chains]

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fat/fatent.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7e..fb98b3d847ed 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
 
 struct fatent_operations {
 	void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 	struct fat_entry fatent;
 	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 	int i, err, nr_bhs;
+	int first_cl = cluster;
 
 	nr_bhs = 0;
 	fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
 			goto error;
 		}
 
+		/* 
+		 * Issue discard for the sectors we no longer care about,
+		 * batching contiguous clusters into one request
+		 */
+		if (cluster != fatent.entry + 1) {
+			int nr_clus = fatent.entry - first_cl + 1;
+
+			sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
+					 nr_clus * sbi->sec_per_clus);
+			first_cl = cluster;
+		}
+
 		ops->ent_put(&fatent, FAT_ENT_FREE);
 		if (sbi->free_clusters != -1) {
 			sbi->free_clusters++;

From eae9acd13a8d14b50c00a961fa959606f34bbd92 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 5 Aug 2008 18:08:25 +0100
Subject: [PATCH 009/132] Support 'discard sectors' operation in translation
 layer support core

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/mtd/mtd_blkdevs.c    | 16 ++++++++++++++++
 include/linux/blkdev.h       |  1 +
 include/linux/mtd/blktrans.h |  2 ++
 3 files changed, 19 insertions(+)

diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 9ff007c4962c..681d5aca2af4 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -32,6 +32,14 @@ struct mtd_blkcore_priv {
 	spinlock_t queue_lock;
 };
 
+static int blktrans_discard_request(struct request_queue *q,
+				    struct request *req)
+{
+	req->cmd_type = REQ_TYPE_LINUX_BLOCK;
+	req->cmd[0] = REQ_LB_OP_DISCARD;
+	return 0;
+}
+
 static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 			       struct mtd_blktrans_dev *dev,
 			       struct request *req)
@@ -44,6 +52,10 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
 	buf = req->buffer;
 
+	if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+	    req->cmd[0] == REQ_LB_OP_DISCARD)
+		return !tr->discard(dev, block, nsect);
+
 	if (!blk_fs_request(req))
 		return 0;
 
@@ -367,6 +379,10 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 
 	tr->blkcore_priv->rq->queuedata = tr;
 	blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize);
+	if (tr->discard)
+		blk_queue_set_discard(tr->blkcore_priv->rq,
+				      blktrans_discard_request);
+
 	tr->blkshift = ffs(tr->blksize) - 1;
 
 	tr->blkcore_priv->thread = kthread_run(mtd_blktrans_thread, tr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 26ececbbebe2..727886d25c4e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -81,6 +81,7 @@ enum {
 	 */
 	REQ_LB_OP_EJECT	= 0x40,		/* eject request */
 	REQ_LB_OP_FLUSH = 0x41,		/* flush device */
+	REQ_LB_OP_DISCARD = 0x42,	/* discard sectors */
 };
 
 /*
diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h
index 310e61606415..8b4aa0523db7 100644
--- a/include/linux/mtd/blktrans.h
+++ b/include/linux/mtd/blktrans.h
@@ -41,6 +41,8 @@ struct mtd_blktrans_ops {
 		    unsigned long block, char *buffer);
 	int (*writesect)(struct mtd_blktrans_dev *dev,
 		     unsigned long block, char *buffer);
+	int (*discard)(struct mtd_blktrans_dev *dev,
+		       unsigned long block, unsigned nr_blocks);
 
 	/* Block layer ioctls */
 	int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo);

From fdc53971bce56d299cb5f1f06ecbff30b34cbaf2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 5 Aug 2008 18:08:56 +0100
Subject: [PATCH 010/132] Support 'discard sectors' operation.

We can benefit from knowing that the file system no longer cares about
the contents of certain sectors, by throwing them away immediately and
then never having to garbage collect them, and using the extra free
space to make our operations more efficient. Do so.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/mtd/ftl.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c
index f34f20c78911..9bf581c4f740 100644
--- a/drivers/mtd/ftl.c
+++ b/drivers/mtd/ftl.c
@@ -1005,6 +1005,29 @@ static int ftl_writesect(struct mtd_blktrans_dev *dev,
 	return ftl_write((void *)dev, buf, block, 1);
 }
 
+static int ftl_discardsect(struct mtd_blktrans_dev *dev,
+			   unsigned long sector, unsigned nr_sects)
+{
+	partition_t *part = (void *)dev;
+	uint32_t bsize = 1 << part->header.EraseUnitSize;
+
+	DEBUG(1, "FTL erase sector %ld for %d sectors\n",
+	      sector, nr_sects);
+
+	while (nr_sects) {
+		uint32_t old_addr = part->VirtualBlockMap[sector];
+		if (old_addr != 0xffffffff) {
+			part->VirtualBlockMap[sector] = 0xffffffff;
+			part->EUNInfo[old_addr/bsize].Deleted++;
+			if (set_bam_entry(part, old_addr, 0))
+				return -EIO;
+		}
+		nr_sects--;
+		sector++;
+	}
+
+	return 0;
+}
 /*====================================================================*/
 
 static void ftl_freepart(partition_t *part)
@@ -1069,6 +1092,7 @@ static struct mtd_blktrans_ops ftl_tr = {
 	.blksize 	= SECTOR_SIZE,
 	.readsect	= ftl_readsect,
 	.writesect	= ftl_writesect,
+	.discard	= ftl_discardsect,
 	.getgeo		= ftl_getgeo,
 	.add_mtd	= ftl_add_mtd,
 	.remove_dev	= ftl_remove_dev,

From 27b29e86bf3d4b3cf6641a0efd78ed11a9b633b2 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 10 Aug 2008 11:21:57 +0100
Subject: [PATCH 011/132] blktrace: support discard requests

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c             | 11 ++++++++++-
 include/linux/blktrace_api.h |  4 ++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/block/blktrace.c b/block/blktrace.c
index eb9651ccb241..7495a84353e4 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -114,7 +114,13 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK
 /*
  * Bio action bits of interest
  */
-static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
+static u32 bio_act[17] __read_mostly = { 
+	[1] = BLK_TC_ACT(BLK_TC_BARRIER),
+	[2] = BLK_TC_ACT(BLK_TC_SYNC),
+	[4] = BLK_TC_ACT(BLK_TC_AHEAD),
+	[8] = BLK_TC_ACT(BLK_TC_META),
+	[16] = BLK_TC_ACT(BLK_TC_DISCARD)
+};
 
 /*
  * More could be added as needed, taking care to increment the decrementer
@@ -128,6 +134,8 @@ static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_AC
 	(((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
 #define trace_meta_bit(rw)	\
 	(((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
+#define trace_discard_bit(rw)	\
+	(((rw) & (1 << BIO_RW_DISCARD)) >> (BIO_RW_DISCARD - 4))
 
 /*
  * The worker for the various blk_add_trace*() types. Fills out a
@@ -151,6 +159,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= bio_act[trace_sync_bit(rw)];
 	what |= bio_act[trace_ahead_bit(rw)];
 	what |= bio_act[trace_meta_bit(rw)];
+	what |= bio_act[trace_discard_bit(rw)];
 
 	pid = tsk->pid;
 	if (unlikely(act_log_check(bt, what, sector, pid)))
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index d084b8d227a5..27da2cc682ee 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -21,6 +21,7 @@ enum blktrace_cat {
 	BLK_TC_NOTIFY	= 1 << 10,	/* special message */
 	BLK_TC_AHEAD	= 1 << 11,	/* readahead */
 	BLK_TC_META	= 1 << 12,	/* metadata */
+	BLK_TC_DISCARD	= 1 << 13,	/* discard requests */
 
 	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
 };
@@ -195,6 +196,9 @@ static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (likely(!bt))
 		return;
 
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
 	if (blk_pc_request(rq)) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
 		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);

From 35ba8f7083e87602b695d6eaca38a6464d5b74db Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 10 Aug 2008 12:33:00 +0100
Subject: [PATCH 012/132] blktrace: simplify flags handling in __blk_add_trace

Let the compiler see what's going on, and it can all get a lot simpler.
On PPC64 this reduces the size of the code calculating these bits by
about 60%. On x86_64 it's less of a win -- only 40%.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c | 38 ++++++++------------------------------
 1 file changed, 8 insertions(+), 30 deletions(-)

diff --git a/block/blktrace.c b/block/blktrace.c
index 7495a84353e4..9e0212c90b29 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -111,31 +111,9 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
  */
 static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
 
-/*
- * Bio action bits of interest
- */
-static u32 bio_act[17] __read_mostly = { 
-	[1] = BLK_TC_ACT(BLK_TC_BARRIER),
-	[2] = BLK_TC_ACT(BLK_TC_SYNC),
-	[4] = BLK_TC_ACT(BLK_TC_AHEAD),
-	[8] = BLK_TC_ACT(BLK_TC_META),
-	[16] = BLK_TC_ACT(BLK_TC_DISCARD)
-};
-
-/*
- * More could be added as needed, taking care to increment the decrementer
- * to get correct indexing
- */
-#define trace_barrier_bit(rw)	\
-	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
-#define trace_sync_bit(rw)	\
-	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
-#define trace_ahead_bit(rw)	\
-	(((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
-#define trace_meta_bit(rw)	\
-	(((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
-#define trace_discard_bit(rw)	\
-	(((rw) & (1 << BIO_RW_DISCARD)) >> (BIO_RW_DISCARD - 4))
+/* The ilog2() calls fall out because they're constant */
+#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \
+	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) )
 
 /*
  * The worker for the various blk_add_trace*() types. Fills out a
@@ -155,11 +133,11 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		return;
 
 	what |= ddir_act[rw & WRITE];
-	what |= bio_act[trace_barrier_bit(rw)];
-	what |= bio_act[trace_sync_bit(rw)];
-	what |= bio_act[trace_ahead_bit(rw)];
-	what |= bio_act[trace_meta_bit(rw)];
-	what |= bio_act[trace_discard_bit(rw)];
+	what |= MASK_TC_BIT(rw, BARRIER);
+	what |= MASK_TC_BIT(rw, SYNC);
+	what |= MASK_TC_BIT(rw, AHEAD);
+	what |= MASK_TC_BIT(rw, META);
+	what |= MASK_TC_BIT(rw, DISCARD);
 
 	pid = tsk->pid;
 	if (unlikely(act_log_check(bt, what, sector, pid)))

From 2ebca85abcfcbaaf1c0b242e39fc88ad3da90090 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Mon, 11 Aug 2008 17:07:08 +0100
Subject: [PATCH 013/132] Use WRITE_BARRIER in blkdev_issue_flush(), not
 (1<<BIO_RW_BARRIER)

Barriers should be submitted with the WRITE flag set.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 273121c0eb80..e5448131d4f1 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -293,7 +293,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	bio->bi_end_io = bio_end_empty_barrier;
 	bio->bi_private = &wait;
 	bio->bi_bdev = bdev;
-	submit_bio(1 << BIO_RW_BARRIER, bio);
+	submit_bio(WRITE_BARRIER, bio);
 
 	wait_for_completion(&wait);
 

From d30a2605be9d5132d95944916e8f578fcfe4f976 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Mon, 11 Aug 2008 15:58:42 +0100
Subject: [PATCH 014/132] Add BLKDISCARD ioctl to allow userspace to discard
 sectors

We may well want mkfs tools to use this to mark the whole device as
unwanted before they format it, for example.

The ioctl takes a pair of uint64_ts, which are start offset and length
in _bytes_. Although at the moment it might make sense for them both to
be in 512-byte sectors, I don't want to limit the ABI to that.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/compat_ioctl.c |  1 +
 block/ioctl.c        | 76 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h   |  1 +
 3 files changed, 78 insertions(+)

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index c23177e4623f..1e559fba7bdf 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -788,6 +788,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
 	case BLKFLSBUF:
 	case BLKROSET:
+	case BLKDISCARD:
 	/*
 	 * the ones below are implemented in blkdev_locked_ioctl,
 	 * but we call blkdev_ioctl, which gets the lock for us
diff --git a/block/ioctl.c b/block/ioctl.c
index 77185e5c026a..342298bb6080 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -111,6 +111,69 @@ static int blkdev_reread_part(struct block_device *bdev)
 	return res;
 }
 
+static void blk_ioc_discard_endio(struct bio *bio, int err)
+{
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	}
+	complete(bio->bi_private);
+}
+
+static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
+			     uint64_t len)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	int ret = 0;
+
+	if (start & 511)
+		return -EINVAL;
+	if (len & 511)
+		return -EINVAL;
+	start >>= 9;
+	len >>= 9;
+
+	if (start + len > (bdev->bd_inode->i_size >> 9))
+		return -EINVAL;
+
+	if (!q->prepare_discard_fn)
+		return -EOPNOTSUPP;
+
+	while (len && !ret) {
+		DECLARE_COMPLETION_ONSTACK(wait);
+		struct bio *bio;
+
+		bio = bio_alloc(GFP_KERNEL, 0);
+		if (!bio)
+			return -ENOMEM;
+
+		bio->bi_end_io = blk_ioc_discard_endio;
+		bio->bi_bdev = bdev;
+		bio->bi_private = &wait;
+		bio->bi_sector = start;
+
+		if (len > q->max_hw_sectors) {
+			bio->bi_size = q->max_hw_sectors << 9;
+			len -= q->max_hw_sectors;
+			start += q->max_hw_sectors;
+		} else {
+			bio->bi_size = len << 9;
+			len = 0;
+		}
+		submit_bio(WRITE_DISCARD, bio);
+
+		wait_for_completion(&wait);
+
+		if (bio_flagged(bio, BIO_EOPNOTSUPP))
+			ret = -EOPNOTSUPP;
+		else if (!bio_flagged(bio, BIO_UPTODATE))
+			ret = -EIO;
+		bio_put(bio);
+	}
+	return ret;
+}
+
 static int put_ushort(unsigned long arg, unsigned short val)
 {
 	return put_user(val, (unsigned short __user *)arg);
@@ -258,6 +321,19 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 		set_device_ro(bdev, n);
 		unlock_kernel();
 		return 0;
+
+	case BLKDISCARD: {
+		uint64_t range[2];
+
+		if (!(file->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
+			return -EFAULT;
+
+		return blk_ioctl_discard(bdev, range[0], range[1]);
+	}
+
 	case HDIO_GETGEO: {
 		struct hd_geometry geo;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eb0131319134..88358ca6af25 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -223,6 +223,7 @@ extern int dir_notify_enable;
 #define BLKTRACESTART _IO(0x12,116)
 #define BLKTRACESTOP _IO(0x12,117)
 #define BLKTRACETEARDOWN _IO(0x12,118)
+#define BLKDISCARD _IO(0x12,119)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */

From e17fc0a1ccf88f6d4dcb363729f3141b0958c325 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sat, 9 Aug 2008 16:42:20 +0100
Subject: [PATCH 015/132] Allow elevators to sort/merge discard requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

But blkdev_issue_discard() still emits requests which are interpreted as
soft barriers, because naïve callers might otherwise issue subsequent
writes to those same sectors, which might cross on the queue (if they're
reallocated quickly enough).

Callers still _can_ issue non-barrier discard requests, but they have to
take care of queue ordering for themselves.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    |  2 +-
 block/blk-core.c       | 12 +++++++-----
 block/blk-merge.c      | 27 +++++++++++++++++----------
 block/elevator.c       | 12 ++++++++++--
 block/ioctl.c          |  2 +-
 include/linux/bio.h    |  2 +-
 include/linux/blkdev.h |  5 +++--
 include/linux/fs.h     |  3 ++-
 8 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index e5448131d4f1..988b63479b2f 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -372,7 +372,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			nr_sects = 0;
 		}
 		bio_get(bio);
-		submit_bio(WRITE_DISCARD, bio);
+		submit_bio(DISCARD_BARRIER, bio);
 
 		/* Check if it failed immediately */
 		if (bio_flagged(bio, BIO_EOPNOTSUPP))
diff --git a/block/blk-core.c b/block/blk-core.c
index 1e143c4f9d34..1261516dd42a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1077,12 +1077,13 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	/*
 	 * REQ_BARRIER implies no merging, but lets make it explicit
 	 */
-	if (unlikely(bio_barrier(bio)))
-		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 	if (unlikely(bio_discard(bio))) {
-		req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD);
+		req->cmd_flags |= REQ_DISCARD;
+		if (bio_barrier(bio))
+			req->cmd_flags |= REQ_SOFTBARRIER;
 		req->q->prepare_discard_fn(req->q, req);
-	}
+	} else if (unlikely(bio_barrier(bio)))
+		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 
 	if (bio_sync(bio))
 		req->cmd_flags |= REQ_RW_SYNC;
@@ -1114,7 +1115,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	blk_queue_bounce(q, &bio);
 
 	barrier = bio_barrier(bio);
-	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
+	if (unlikely(barrier) && bio_has_data(bio) &&
+	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
 		err = -EOPNOTSUPP;
 		goto end_io;
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5efc9e7a68b7..6cf8f0c70a51 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -11,7 +11,7 @@
 
 void blk_recalc_rq_sectors(struct request *rq, int nsect)
 {
-	if (blk_fs_request(rq)) {
+	if (blk_fs_request(rq) || blk_discard_rq(rq)) {
 		rq->hard_sector += nsect;
 		rq->hard_nr_sectors -= nsect;
 
@@ -131,13 +131,17 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
 		return 0;
 
-	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
-		return 0;
 	if (bio->bi_size + nxt->bi_size > q->max_segment_size)
 		return 0;
 
+	if (!bio_has_data(bio))
+		return 1;
+
+	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
+		return 0;
+
 	/*
-	 * bio and nxt are contigous in memory, check if the queue allows
+	 * bio and nxt are contiguous in memory; check if the queue allows
 	 * these two to be merged into one
 	 */
 	if (BIO_SEG_BOUNDARY(q, bio, nxt))
@@ -153,8 +157,9 @@ static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
 		blk_recount_segments(q, bio);
 	if (!bio_flagged(nxt, BIO_SEG_VALID))
 		blk_recount_segments(q, nxt);
-	if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
-	    BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
+	if (bio_has_data(bio) &&
+	    (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
+	     BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)))
 		return 0;
 	if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
 		return 0;
@@ -317,8 +322,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 	if (!bio_flagged(bio, BIO_SEG_VALID))
 		blk_recount_segments(q, bio);
 	len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
-	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio))
-	    && !BIOVEC_VIRT_OVERSIZE(len)) {
+	if (!bio_has_data(bio) || 
+	    (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio))
+	     && !BIOVEC_VIRT_OVERSIZE(len))) {
 		int mergeable =  ll_new_mergeable(q, req, bio);
 
 		if (mergeable) {
@@ -356,8 +362,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		blk_recount_segments(q, bio);
 	if (!bio_flagged(req->bio, BIO_SEG_VALID))
 		blk_recount_segments(q, req->bio);
-	if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
-	    !BIOVEC_VIRT_OVERSIZE(len)) {
+	if (!bio_has_data(bio) || 
+	    (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
+	     !BIOVEC_VIRT_OVERSIZE(len))) {
 		int mergeable =  ll_new_mergeable(q, req, bio);
 
 		if (mergeable) {
diff --git a/block/elevator.c b/block/elevator.c
index ed6f8f32d27e..4f5127054e3f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -74,6 +74,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (!rq_mergeable(rq))
 		return 0;
 
+	/*
+	 * Don't merge file system requests and discard requests
+	 */
+	if (bio_discard(bio) != bio_discard(rq->bio))
+		return 0;
+
 	/*
 	 * different data direction or already started, don't merge
 	 */
@@ -438,6 +444,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 	list_for_each_prev(entry, &q->queue_head) {
 		struct request *pos = list_entry_rq(entry);
 
+		if (blk_discard_rq(rq) != blk_discard_rq(pos))
+			break;
 		if (rq_data_dir(rq) != rq_data_dir(pos))
 			break;
 		if (pos->cmd_flags & stop_flags)
@@ -607,7 +615,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		break;
 
 	case ELEVATOR_INSERT_SORT:
-		BUG_ON(!blk_fs_request(rq));
+		BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
 		rq->cmd_flags |= REQ_SORTED;
 		q->nr_sorted++;
 		if (rq_mergeable(rq)) {
@@ -692,7 +700,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
 		 * this request is scheduling boundary, update
 		 * end_sector
 		 */
-		if (blk_fs_request(rq)) {
+		if (blk_fs_request(rq) || blk_discard_rq(rq)) {
 			q->end_sector = rq_end_sector(rq);
 			q->boundary_rq = rq;
 		}
diff --git a/block/ioctl.c b/block/ioctl.c
index 342298bb6080..375c57922b00 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -161,7 +161,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			bio->bi_size = len << 9;
 			len = 0;
 		}
-		submit_bio(WRITE_DISCARD, bio);
+		submit_bio(DISCARD_NOBARRIER, bio);
 
 		wait_for_completion(&wait);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1fdfc5621c83..33c3947d61e9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -188,8 +188,8 @@ struct bio {
 #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
-#define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio))
 #define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
+#define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
 
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 727886d25c4e..e9eb35c9bf26 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -541,7 +541,7 @@ enum {
 #define blk_noretry_request(rq)	((rq)->cmd_flags & REQ_FAILFAST)
 #define blk_rq_started(rq)	((rq)->cmd_flags & REQ_STARTED)
 
-#define blk_account_rq(rq)	(blk_rq_started(rq) && blk_fs_request(rq))
+#define blk_account_rq(rq)	(blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq))) 
 
 #define blk_pm_suspend_request(rq)	((rq)->cmd_type == REQ_TYPE_PM_SUSPEND)
 #define blk_pm_resume_request(rq)	((rq)->cmd_type == REQ_TYPE_PM_RESUME)
@@ -598,7 +598,8 @@ static inline void blk_clear_queue_full(struct request_queue *q, int rw)
 #define RQ_NOMERGE_FLAGS	\
 	(REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
 #define rq_mergeable(rq)	\
-	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq)))
+	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
+	 (blk_discard_rq(rq) || blk_fs_request((rq))))
 
 /*
  * q->prep_rq_fn return values
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 88358ca6af25..860689f541b1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -87,7 +87,8 @@ extern int dir_notify_enable;
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
 #define SWRITE_SYNC	(SWRITE | (1 << BIO_RW_SYNC))
 #define WRITE_BARRIER	(WRITE | (1 << BIO_RW_BARRIER))
-#define WRITE_DISCARD	(WRITE | (1 << BIO_RW_DISCARD))
+#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
+#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
 
 #define SEL_IN		1
 #define SEL_OUT		2

From 1a8e2bddd5c29008f311613e75925fecbf522c5b Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Wed, 13 Aug 2008 12:35:09 +0100
Subject: [PATCH 016/132] Kill REQ_TYPE_FLUSH

It was only used by ps3disk, and it should probably have been
REQ_TYPE_LINUX_BLOCK + REQ_LB_OP_FLUSH.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/ps3disk.c | 9 ++++++---
 include/linux/blkdev.h  | 6 +-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index d797e209951d..4b0d6c7f4c66 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -199,7 +199,8 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 		if (blk_fs_request(req)) {
 			if (ps3disk_submit_request_sg(dev, req))
 				break;
-		} else if (req->cmd_type == REQ_TYPE_FLUSH) {
+		} else if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+			   req->cmd[0] == REQ_LB_OP_FLUSH) {
 			if (ps3disk_submit_flush_request(dev, req))
 				break;
 		} else {
@@ -257,7 +258,8 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 		return IRQ_HANDLED;
 	}
 
-	if (req->cmd_type == REQ_TYPE_FLUSH) {
+	if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+	    req->cmd[0] == REQ_LB_OP_FLUSH) {
 		read = 0;
 		num_sectors = req->hard_cur_sectors;
 		op = "flush";
@@ -405,7 +407,8 @@ static void ps3disk_prepare_flush(struct request_queue *q, struct request *req)
 
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
-	req->cmd_type = REQ_TYPE_FLUSH;
+	req->cmd_type = REQ_TYPE_LINUX_BLOCK;
+	req->cmd[0] = REQ_LB_OP_FLUSH;
 }
 
 static unsigned long ps3disk_mask;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e9eb35c9bf26..f131776f029e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -54,7 +54,6 @@ enum rq_cmd_type_bits {
 	REQ_TYPE_PM_SUSPEND,		/* suspend request */
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
-	REQ_TYPE_FLUSH,			/* flush request */
 	REQ_TYPE_SPECIAL,		/* driver defined type */
 	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
 	/*
@@ -76,11 +75,8 @@ enum rq_cmd_type_bits {
  *
  */
 enum {
-	/*
-	 * just examples for now
-	 */
 	REQ_LB_OP_EJECT	= 0x40,		/* eject request */
-	REQ_LB_OP_FLUSH = 0x41,		/* flush device */
+	REQ_LB_OP_FLUSH = 0x41,		/* flush request */
 	REQ_LB_OP_DISCARD = 0x42,	/* discard sectors */
 };
 

From 766ca4428d1239a970926856c447310c9c191af2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?=
 <fernando@oss.ntt.co.jp>
Date: Thu, 14 Aug 2008 09:59:13 +0200
Subject: [PATCH 017/132] virtio_blk: use a wrapper function to access io
 context information of IO requests

struct request has an ioprio member but it is never updated because
currently bios do not hold io context information. The implication of
this is that virtio_blk ends up passing useless information to the
backend driver.

That said, some IO schedulers such as CFQ do store io context
information in struct request, but use private members for that, which
means that that information cannot be directly accessed in an IO
scheduler-independent way.

This patch adds a function to obtain the ioprio of a request. We should
avoid accessing ioprio directly and use this function instead, so that
its users do not have to care about future changes in block layer
structures or what the currently active IO controller is.

This patch does not introduce any functional changes but paves the way
for future clean-ups and enhancements.

Signed-off-by: Fernando Luis Vazquez Cao <fernando@oss.ntt.co.jp>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/virtio_blk.c | 4 ++--
 include/linux/blkdev.h     | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42251095134f..879506a2c234 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -84,11 +84,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	if (blk_fs_request(vbr->req)) {
 		vbr->out_hdr.type = 0;
 		vbr->out_hdr.sector = vbr->req->sector;
-		vbr->out_hdr.ioprio = vbr->req->ioprio;
+		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 	} else if (blk_pc_request(vbr->req)) {
 		vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
 		vbr->out_hdr.sector = 0;
-		vbr->out_hdr.ioprio = vbr->req->ioprio;
+		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 	} else {
 		/* We don't put anything else in the queue. */
 		BUG();
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f131776f029e..490ce458b031 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -232,6 +232,11 @@ struct request {
 	struct request *next_rq;
 };
 
+static inline unsigned short req_get_ioprio(struct request *req)
+{
+	return req->ioprio;
+}
+
 /*
  * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
  * requests. Some step values could eventually be made generic.

From 63de428b139d3d31d86ebe25ae97b33f6540fb7e Mon Sep 17 00:00:00 2001
From: Aaron Carroll <aaronc@gelato.unsw.edu.au>
Date: Thu, 14 Aug 2008 18:17:13 +1000
Subject: [PATCH 018/132] deadline-iosched: allow non-sequential batching

Deadline currently only batches sector-contiguous requests, so except
for a few circumstances (e.g. requests in a single direction), it is
essentially first come first served.  This is bad for throughput, so
change it to CSCAN, which means requests in a batch do not need to be
sequential and are issued in increasing sector order.

Signed-off-by: Aaron Carroll <aaronc@gelato.unsw.edu.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/deadline-iosched.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 342448c3d2dd..07b80e4642f9 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -258,17 +258,9 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
 	else
 		rq = dd->next_rq[READ];
 
-	if (rq) {
-		/* we have a "next request" */
-		
-		if (dd->last_sector != rq->sector)
-			/* end the batch on a non sequential request */
-			dd->batching += dd->fifo_batch;
-		
-		if (dd->batching < dd->fifo_batch)
-			/* we are still entitled to batch */
-			goto dispatch_request;
-	}
+	if (rq && dd->batching < dd->fifo_batch)
+		/* we have a next request are still entitled to batch */
+		goto dispatch_request;
 
 	/*
 	 * at this point we are not running a batch. select the appropriate

From 4fb72f7646e86874eb2798256eaa6bf3fbe4edcf Mon Sep 17 00:00:00 2001
From: Aaron Carroll <aaronc@gelato.unsw.edu.au>
Date: Thu, 14 Aug 2008 18:17:14 +1000
Subject: [PATCH 019/132] deadline-iosched: non-functional fixes

* convert goto to simpler while loop;
* use rq_end_sector() instead of computing manually;
* fix false comments;
* remove spurious whitespace;
* convert rq_rb_root macro to an inline function.

Signed-off-by: Aaron Carroll <aaronc@gelato.unsw.edu.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/deadline-iosched.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 07b80e4642f9..fd311179f44c 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -33,7 +33,7 @@ struct deadline_data {
 	 */
 	struct rb_root sort_list[2];	
 	struct list_head fifo_list[2];
-	
+
 	/*
 	 * next in sort order. read, write or both are NULL
 	 */
@@ -53,7 +53,11 @@ struct deadline_data {
 
 static void deadline_move_request(struct deadline_data *, struct request *);
 
-#define RQ_RB_ROOT(dd, rq)	(&(dd)->sort_list[rq_data_dir((rq))])
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+	return &dd->sort_list[rq_data_dir(rq)];
+}
 
 /*
  * get the request after `rq' in sector-sorted order
@@ -72,15 +76,11 @@ deadline_latter_request(struct request *rq)
 static void
 deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
 {
-	struct rb_root *root = RQ_RB_ROOT(dd, rq);
+	struct rb_root *root = deadline_rb_root(dd, rq);
 	struct request *__alias;
 
-retry:
-	__alias = elv_rb_add(root, rq);
-	if (unlikely(__alias)) {
+	while (unlikely(__alias = elv_rb_add(root, rq)))
 		deadline_move_request(dd, __alias);
-		goto retry;
-	}
 }
 
 static inline void
@@ -91,7 +91,7 @@ deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
 	if (dd->next_rq[data_dir] == rq)
 		dd->next_rq[data_dir] = deadline_latter_request(rq);
 
-	elv_rb_del(RQ_RB_ROOT(dd, rq), rq);
+	elv_rb_del(deadline_rb_root(dd, rq), rq);
 }
 
 /*
@@ -106,7 +106,7 @@ deadline_add_request(struct request_queue *q, struct request *rq)
 	deadline_add_rq_rb(dd, rq);
 
 	/*
-	 * set expire time (only used for reads) and add to fifo list
+	 * set expire time and add to fifo list
 	 */
 	rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
 	list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
@@ -162,7 +162,7 @@ static void deadline_merged_request(struct request_queue *q,
 	 * if the merge was a front merge, we need to reposition request
 	 */
 	if (type == ELEVATOR_FRONT_MERGE) {
-		elv_rb_del(RQ_RB_ROOT(dd, req), req);
+		elv_rb_del(deadline_rb_root(dd, req), req);
 		deadline_add_rq_rb(dd, req);
 	}
 }
@@ -212,7 +212,7 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
 	dd->next_rq[WRITE] = NULL;
 	dd->next_rq[data_dir] = deadline_latter_request(rq);
 
-	dd->last_sector = rq->sector + rq->nr_sectors;
+	dd->last_sector = rq_end_sector(rq);
 
 	/*
 	 * take it off the sort and fifo list, move
@@ -222,7 +222,7 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
 }
 
 /*
- * deadline_check_fifo returns 0 if there are no expired reads on the fifo,
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
  * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
  */
 static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)

From 6a421c1dc94b12923294a359822346f12492de5e Mon Sep 17 00:00:00 2001
From: Aaron Carroll <aaronc@gelato.unsw.edu.au>
Date: Thu, 14 Aug 2008 18:17:15 +1000
Subject: [PATCH 020/132] block: update documentation for deadline fifo_batch
 tunable

Update the description of fifo_batch to match the current implementation,
and include a description of how to tune it.

Signed-off-by: Aaron Carroll <aaronc@gelato.unsw.edu.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/block/deadline-iosched.txt | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
index c23cab13c3d1..72576769e0f4 100644
--- a/Documentation/block/deadline-iosched.txt
+++ b/Documentation/block/deadline-iosched.txt
@@ -30,12 +30,18 @@ write_expire	(in ms)
 Similar to read_expire mentioned above, but for writes.
 
 
-fifo_batch
+fifo_batch	(number of requests)
 ----------
 
-When a read request expires its deadline, we must move some requests from
-the sorted io scheduler list to the block device dispatch queue. fifo_batch
-controls how many requests we move.
+Requests are grouped into ``batches'' of a particular data direction (read or
+write) which are serviced in increasing sector order.  To limit extra seeking,
+deadline expiries are only checked between batches.  fifo_batch controls the
+maximum number of requests per batch.
+
+This parameter tunes the balance between per-request latency and aggregate
+throughput.  When low latency is the primary concern, smaller is better (where
+a value of 1 yields first-come first-served behaviour).  Increasing fifo_batch
+generally improves throughput, at the cost of latency variation.
 
 
 writes_starved	(number of dispatches)

From b8b3e16cfe6435d961f6aaebcfd52a1ff2a988c5 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 15 Aug 2008 10:15:19 +0200
Subject: [PATCH 021/132] block: drop virtual merging accounting

Remove virtual merge accounting.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-merge.c   | 79 ++++-----------------------------------------
 fs/bio.c            |  6 ++--
 include/linux/bio.h | 15 ---------
 3 files changed, 8 insertions(+), 92 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6cf8f0c70a51..2c2a2ee716ec 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -66,7 +66,7 @@ void blk_recalc_rq_segments(struct request *rq)
 		 */
 		high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
 		if (high || highprv)
-			goto new_hw_segment;
+			goto new_segment;
 		if (cluster) {
 			if (seg_size + bv->bv_len > q->max_segment_size)
 				goto new_segment;
@@ -74,8 +74,6 @@ void blk_recalc_rq_segments(struct request *rq)
 				goto new_segment;
 			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
 				goto new_segment;
-			if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
-				goto new_hw_segment;
 
 			seg_size += bv->bv_len;
 			hw_seg_size += bv->bv_len;
@@ -83,17 +81,11 @@ void blk_recalc_rq_segments(struct request *rq)
 			continue;
 		}
 new_segment:
-		if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
-		    !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
-			hw_seg_size += bv->bv_len;
-		else {
-new_hw_segment:
-			if (nr_hw_segs == 1 &&
-			    hw_seg_size > rq->bio->bi_hw_front_size)
-				rq->bio->bi_hw_front_size = hw_seg_size;
-			hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
-			nr_hw_segs++;
-		}
+		if (nr_hw_segs == 1 &&
+		    hw_seg_size > rq->bio->bi_hw_front_size)
+			rq->bio->bi_hw_front_size = hw_seg_size;
+		hw_seg_size = bv->bv_len;
+		nr_hw_segs++;
 
 		nr_phys_segs++;
 		bvprv = bv;
@@ -150,23 +142,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	return 0;
 }
 
-static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
-				 struct bio *nxt)
-{
-	if (!bio_flagged(bio, BIO_SEG_VALID))
-		blk_recount_segments(q, bio);
-	if (!bio_flagged(nxt, BIO_SEG_VALID))
-		blk_recount_segments(q, nxt);
-	if (bio_has_data(bio) &&
-	    (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
-	     BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)))
-		return 0;
-	if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
-		return 0;
-
-	return 1;
-}
-
 /*
  * map a request to scatterlist, return number of sg entries setup. Caller
  * must make sure sg can hold rq->nr_phys_segments entries
@@ -304,7 +279,6 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 		     struct bio *bio)
 {
 	unsigned short max_sectors;
-	int len;
 
 	if (unlikely(blk_pc_request(req)))
 		max_sectors = q->max_hw_sectors;
@@ -321,20 +295,6 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 		blk_recount_segments(q, req->biotail);
 	if (!bio_flagged(bio, BIO_SEG_VALID))
 		blk_recount_segments(q, bio);
-	len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
-	if (!bio_has_data(bio) || 
-	    (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio))
-	     && !BIOVEC_VIRT_OVERSIZE(len))) {
-		int mergeable =  ll_new_mergeable(q, req, bio);
-
-		if (mergeable) {
-			if (req->nr_hw_segments == 1)
-				req->bio->bi_hw_front_size = len;
-			if (bio->bi_hw_segments == 1)
-				bio->bi_hw_back_size = len;
-		}
-		return mergeable;
-	}
 
 	return ll_new_hw_segment(q, req, bio);
 }
@@ -343,7 +303,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		      struct bio *bio)
 {
 	unsigned short max_sectors;
-	int len;
 
 	if (unlikely(blk_pc_request(req)))
 		max_sectors = q->max_hw_sectors;
@@ -357,24 +316,10 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 			q->last_merge = NULL;
 		return 0;
 	}
-	len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
 	if (!bio_flagged(bio, BIO_SEG_VALID))
 		blk_recount_segments(q, bio);
 	if (!bio_flagged(req->bio, BIO_SEG_VALID))
 		blk_recount_segments(q, req->bio);
-	if (!bio_has_data(bio) || 
-	    (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
-	     !BIOVEC_VIRT_OVERSIZE(len))) {
-		int mergeable =  ll_new_mergeable(q, req, bio);
-
-		if (mergeable) {
-			if (bio->bi_hw_segments == 1)
-				bio->bi_hw_front_size = len;
-			if (req->nr_hw_segments == 1)
-				req->biotail->bi_hw_back_size = len;
-		}
-		return mergeable;
-	}
 
 	return ll_new_hw_segment(q, req, bio);
 }
@@ -406,18 +351,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 		return 0;
 
 	total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
-	if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
-		int len = req->biotail->bi_hw_back_size +
-				next->bio->bi_hw_front_size;
-		/*
-		 * propagate the combined length to the end of the requests
-		 */
-		if (req->nr_hw_segments == 1)
-			req->bio->bi_hw_front_size = len;
-		if (next->nr_hw_segments == 1)
-			next->biotail->bi_hw_back_size = len;
-		total_hw_segments--;
-	}
 
 	if (total_hw_segments > q->max_hw_segments)
 		return 0;
diff --git a/fs/bio.c b/fs/bio.c
index 3cba7ae34d75..4ac7c59d1c6d 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -350,8 +350,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 */
 
 	while (bio->bi_phys_segments >= q->max_phys_segments
-	       || bio->bi_hw_segments >= q->max_hw_segments
-	       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
+	       || bio->bi_hw_segments >= q->max_hw_segments) {
 
 		if (retried_segments)
 			return 0;
@@ -395,8 +394,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	}
 
 	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
-	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
+	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
 		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
 	bio->bi_vcnt++;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 33c3947d61e9..894d16ce0020 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -26,21 +26,8 @@
 
 #ifdef CONFIG_BLOCK
 
-/* Platforms may set this to teach the BIO layer about IOMMU hardware. */
 #include <asm/io.h>
 
-#if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY)
-#define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1))
-#define BIOVEC_VIRT_OVERSIZE(x)	((x) > BIO_VMERGE_MAX_SIZE)
-#else
-#define BIOVEC_VIRT_START_SIZE(x)	0
-#define BIOVEC_VIRT_OVERSIZE(x)		0
-#endif
-
-#ifndef BIO_VMERGE_BOUNDARY
-#define BIO_VMERGE_BOUNDARY	0
-#endif
-
 #define BIO_DEBUG
 
 #ifdef BIO_DEBUG
@@ -240,8 +227,6 @@ static inline void *bio_data(struct bio *bio)
 	((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
 #endif
 
-#define BIOVEC_VIRT_MERGEABLE(vec1, vec2)	\
-	((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0)
 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
 	(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \

From 5df97b91b5d7ed426034fcc84cb6e7cf682b8838 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 15 Aug 2008 10:20:02 +0200
Subject: [PATCH 022/132] drop vmerge accounting

Remove hw_segments field from struct bio and struct request. Without virtual
merge accounting they have no purpose.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       |  1 -
 block/blk-merge.c      | 31 ++++---------------------------
 block/elevator.c       |  2 --
 drivers/md/raid1.c     |  3 ---
 drivers/md/raid10.c    |  3 ---
 fs/bio.c               | 12 +-----------
 include/linux/bio.h    | 16 +---------------
 include/linux/blkdev.h |  7 -------
 8 files changed, 6 insertions(+), 69 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 1261516dd42a..2616cdd049a8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2026,7 +2026,6 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 
 	if (bio_has_data(bio)) {
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
-		rq->nr_hw_segments = bio_hw_segments(q, bio);
 		rq->buffer = bio_data(bio);
 	}
 	rq->current_nr_sectors = bio_cur_sectors(bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2c2a2ee716ec..d81d91419ff5 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -41,12 +41,9 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
 void blk_recalc_rq_segments(struct request *rq)
 {
 	int nr_phys_segs;
-	int nr_hw_segs;
 	unsigned int phys_size;
-	unsigned int hw_size;
 	struct bio_vec *bv, *bvprv = NULL;
 	int seg_size;
-	int hw_seg_size;
 	int cluster;
 	struct req_iterator iter;
 	int high, highprv = 1;
@@ -56,8 +53,8 @@ void blk_recalc_rq_segments(struct request *rq)
 		return;
 
 	cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
-	hw_seg_size = seg_size = 0;
-	phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
+	seg_size = 0;
+	phys_size = nr_phys_segs = 0;
 	rq_for_each_segment(bv, rq, iter) {
 		/*
 		 * the trick here is making sure that a high page is never
@@ -76,30 +73,17 @@ void blk_recalc_rq_segments(struct request *rq)
 				goto new_segment;
 
 			seg_size += bv->bv_len;
-			hw_seg_size += bv->bv_len;
 			bvprv = bv;
 			continue;
 		}
 new_segment:
-		if (nr_hw_segs == 1 &&
-		    hw_seg_size > rq->bio->bi_hw_front_size)
-			rq->bio->bi_hw_front_size = hw_seg_size;
-		hw_seg_size = bv->bv_len;
-		nr_hw_segs++;
-
 		nr_phys_segs++;
 		bvprv = bv;
 		seg_size = bv->bv_len;
 		highprv = high;
 	}
 
-	if (nr_hw_segs == 1 &&
-	    hw_seg_size > rq->bio->bi_hw_front_size)
-		rq->bio->bi_hw_front_size = hw_seg_size;
-	if (hw_seg_size > rq->biotail->bi_hw_back_size)
-		rq->biotail->bi_hw_back_size = hw_seg_size;
 	rq->nr_phys_segments = nr_phys_segs;
-	rq->nr_hw_segments = nr_hw_segs;
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
@@ -112,7 +96,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
 	blk_recalc_rq_segments(&rq);
 	bio->bi_next = nxt;
 	bio->bi_phys_segments = rq.nr_phys_segments;
-	bio->bi_hw_segments = rq.nr_hw_segments;
 	bio->bi_flags |= (1 << BIO_SEG_VALID);
 }
 EXPORT_SYMBOL(blk_recount_segments);
@@ -255,10 +238,9 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 				    struct request *req,
 				    struct bio *bio)
 {
-	int nr_hw_segs = bio_hw_segments(q, bio);
 	int nr_phys_segs = bio_phys_segments(q, bio);
 
-	if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
+	if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
 	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
@@ -270,7 +252,6 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 	 * This will form the start of a new hw segment.  Bump both
 	 * counters.
 	 */
-	req->nr_hw_segments += nr_hw_segs;
 	req->nr_phys_segments += nr_phys_segs;
 	return 1;
 }
@@ -328,7 +309,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 				struct request *next)
 {
 	int total_phys_segments;
-	int total_hw_segments;
 
 	/*
 	 * First check if the either of the requests are re-queued
@@ -350,14 +330,11 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	if (total_phys_segments > q->max_phys_segments)
 		return 0;
 
-	total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
-
-	if (total_hw_segments > q->max_hw_segments)
+	if (total_phys_segments > q->max_hw_segments)
 		return 0;
 
 	/* Merge is OK... */
 	req->nr_phys_segments = total_phys_segments;
-	req->nr_hw_segments = total_hw_segments;
 	return 1;
 }
 
diff --git a/block/elevator.c b/block/elevator.c
index 4f5127054e3f..269615e6dbf5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -790,7 +790,6 @@ struct request *elv_next_request(struct request_queue *q)
 			 * device can handle
 			 */
 			rq->nr_phys_segments++;
-			rq->nr_hw_segments++;
 		}
 
 		if (!q->prep_rq_fn)
@@ -813,7 +812,6 @@ struct request *elv_next_request(struct request_queue *q)
 				 * so that we don't add it again
 				 */
 				--rq->nr_phys_segments;
-				--rq->nr_hw_segments;
 			}
 
 			rq = NULL;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 03a5ab705c20..28a3869dcfd2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1302,9 +1302,6 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 					sbio->bi_size = r1_bio->sectors << 9;
 					sbio->bi_idx = 0;
 					sbio->bi_phys_segments = 0;
-					sbio->bi_hw_segments = 0;
-					sbio->bi_hw_front_size = 0;
-					sbio->bi_hw_back_size = 0;
 					sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
 					sbio->bi_flags |= 1 << BIO_UPTODATE;
 					sbio->bi_next = NULL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e34cd0e62473..0f40688503e7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1345,9 +1345,6 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 		tbio->bi_size = r10_bio->sectors << 9;
 		tbio->bi_idx = 0;
 		tbio->bi_phys_segments = 0;
-		tbio->bi_hw_segments = 0;
-		tbio->bi_hw_front_size = 0;
-		tbio->bi_hw_back_size = 0;
 		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
 		tbio->bi_flags |= 1 << BIO_UPTODATE;
 		tbio->bi_next = NULL;
diff --git a/fs/bio.c b/fs/bio.c
index 4ac7c59d1c6d..bee4deca774a 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -208,14 +208,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 	return bio->bi_phys_segments;
 }
 
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-		blk_recount_segments(q, bio);
-
-	return bio->bi_hw_segments;
-}
-
 /**
  * 	__bio_clone	-	clone a bio
  * 	@bio: destination bio
@@ -350,7 +342,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 */
 
 	while (bio->bi_phys_segments >= q->max_phys_segments
-	       || bio->bi_hw_segments >= q->max_hw_segments) {
+	       || bio->bi_phys_segments >= q->max_hw_segments) {
 
 		if (retried_segments)
 			return 0;
@@ -399,7 +391,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 
 	bio->bi_vcnt++;
 	bio->bi_phys_segments++;
-	bio->bi_hw_segments++;
  done:
 	bio->bi_size += len;
 	return len;
@@ -1381,7 +1372,6 @@ EXPORT_SYMBOL(bio_init);
 EXPORT_SYMBOL(__bio_clone);
 EXPORT_SYMBOL(bio_clone);
 EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_hw_segments);
 EXPORT_SYMBOL(bio_add_page);
 EXPORT_SYMBOL(bio_add_pc_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 894d16ce0020..dfc3556d311c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -77,21 +77,8 @@ struct bio {
 	 */
 	unsigned short		bi_phys_segments;
 
-	/* Number of segments after physical and DMA remapping
-	 * hardware coalescing is performed.
-	 */
-	unsigned short		bi_hw_segments;
-
 	unsigned int		bi_size;	/* residual I/O count */
 
-	/*
-	 * To keep track of the max hw size, we account for the
-	 * sizes of the first and last virtually mergeable segments
-	 * in this bio
-	 */
-	unsigned int		bi_hw_front_size;
-	unsigned int		bi_hw_back_size;
-
 	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
 
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
@@ -113,7 +100,7 @@ struct bio {
 #define BIO_UPTODATE	0	/* ok after I/O completion */
 #define BIO_RW_BLOCK	1	/* RW_AHEAD set, and read/write would block */
 #define BIO_EOF		2	/* out-out-bounds error */
-#define BIO_SEG_VALID	3	/* nr_hw_seg valid */
+#define BIO_SEG_VALID	3	/* bi_phys_segments valid */
 #define BIO_CLONED	4	/* doesn't own data */
 #define BIO_BOUNCED	5	/* bio is a bounce bio */
 #define BIO_USER_MAPPED 6	/* contains user pages */
@@ -324,7 +311,6 @@ extern void bio_free(struct bio *, struct bio_set *);
 extern void bio_endio(struct bio *, int);
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
-extern int bio_hw_segments(struct request_queue *, struct bio *);
 
 extern void __bio_clone(struct bio *, struct bio *);
 extern struct bio *bio_clone(struct bio *, gfp_t);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 490ce458b031..1adb03827bd3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -189,13 +189,6 @@ struct request {
 	 */
 	unsigned short nr_phys_segments;
 
-	/* Number of scatter-gather addr+len pairs after
-	 * physical and DMA remapping hardware coalescing is performed.
-	 * This is the number of scatter-gather entries the driver
-	 * will actually have to deal with after DMA mapping is done.
-	 */
-	unsigned short nr_hw_segments;
-
 	unsigned short ioprio;
 
 	void *special;

From 960e739d9e9f1c2346d8bdc65299ee2e1ed42218 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 15 Aug 2008 10:41:18 +0200
Subject: [PATCH 023/132] block: raid fixups for removal of bi_hw_segments

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/md/raid1.c  |  1 -
 drivers/md/raid10.c |  1 -
 drivers/md/raid5.c  | 66 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 28a3869dcfd2..0b82030c265d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1787,7 +1787,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		bio->bi_vcnt = 0;
 		bio->bi_idx = 0;
 		bio->bi_phys_segments = 0;
-		bio->bi_hw_segments = 0;
 		bio->bi_size = 0;
 		bio->bi_end_io = NULL;
 		bio->bi_private = NULL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0f40688503e7..d3b9aa096285 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1944,7 +1944,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		bio->bi_vcnt = 0;
 		bio->bi_idx = 0;
 		bio->bi_phys_segments = 0;
-		bio->bi_hw_segments = 0;
 		bio->bi_size = 0;
 	}
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 224de022e7c5..05b22925cce4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -101,6 +101,40 @@
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
 #endif
 
+/*
+ * We maintain a biased count of active stripes in the bottom 8 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 8 bits
+ */
+static inline int raid5_bi_phys_segments(struct bio *bio)
+{
+	return bio->bi_phys_segments & 0xff;
+}
+
+static inline int raid5_bi_hw_segments(struct bio *bio)
+{
+	return (bio->bi_phys_segments >> 8) & 0xff;
+}
+
+static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+{
+	--bio->bi_phys_segments;
+	return raid5_bi_phys_segments(bio);
+}
+
+static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+{
+	unsigned short val = raid5_bi_hw_segments(bio);
+
+	--val;
+	bio->bi_phys_segments = (val << 8) | raid5_bi_phys_segments(bio);
+	return val;
+}
+
+static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+{
+	bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 8);
+}
+
 static inline int raid6_next_disk(int disk, int raid_disks)
 {
 	disk++;
@@ -507,7 +541,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (--rbi->bi_phys_segments == 0) {
+				if (!raid5_dec_bi_phys_segments(rbi)) {
 					rbi->bi_next = return_bi;
 					return_bi = rbi;
 				}
@@ -1725,7 +1759,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	if (*bip)
 		bi->bi_next = *bip;
 	*bip = bi;
-	bi->bi_phys_segments ++;
+	bi->bi_phys_segments++;
 	spin_unlock_irq(&conf->device_lock);
 	spin_unlock(&sh->lock);
 
@@ -1819,7 +1853,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 			sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			if (--bi->bi_phys_segments == 0) {
+			if (!raid5_dec_bi_phys_segments(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
 				*return_bi = bi;
@@ -1834,7 +1868,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			if (--bi->bi_phys_segments == 0) {
+			if (!raid5_dec_bi_phys_segments(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
 				*return_bi = bi;
@@ -1858,7 +1892,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (--bi->bi_phys_segments == 0) {
+				if (!raid5_dec_bi_phys_segments(bi)) {
 					bi->bi_next = *return_bi;
 					*return_bi = bi;
 				}
@@ -2033,7 +2067,7 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
 				while (wbi && wbi->bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
-					if (--wbi->bi_phys_segments == 0) {
+					if (!raid5_dec_bi_phys_segments(wbi)) {
 						md_write_end(conf->mddev);
 						wbi->bi_next = *return_bi;
 						*return_bi = wbi;
@@ -2814,7 +2848,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 				copy_data(0, rbi, dev->page, dev->sector);
 				rbi2 = r5_next_bio(rbi, dev->sector);
 				spin_lock_irq(&conf->device_lock);
-				if (--rbi->bi_phys_segments == 0) {
+				if (!raid5_dec_bi_phys_segments(rbi)) {
 					rbi->bi_next = return_bi;
 					return_bi = rbi;
 				}
@@ -3155,8 +3189,11 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
 	if(bi) {
 		conf->retry_read_aligned_list = bi->bi_next;
 		bi->bi_next = NULL;
+		/*
+		 * this sets the active strip count to 1 and the processed
+		 * strip count to zero (upper 8 bits)
+		 */
 		bi->bi_phys_segments = 1; /* biased count of active stripes */
-		bi->bi_hw_segments = 0; /* count of processed stripes */
 	}
 
 	return bi;
@@ -3206,8 +3243,7 @@ static int bio_fits_rdev(struct bio *bi)
 	if ((bi->bi_size>>9) > q->max_sectors)
 		return 0;
 	blk_recount_segments(q, bi);
-	if (bi->bi_phys_segments > q->max_phys_segments ||
-	    bi->bi_hw_segments > q->max_hw_segments)
+	if (bi->bi_phys_segments > q->max_phys_segments)
 		return 0;
 
 	if (q->merge_bvec_fn)
@@ -3468,7 +3504,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 			
 	}
 	spin_lock_irq(&conf->device_lock);
-	remaining = --bi->bi_phys_segments;
+	remaining = raid5_dec_bi_phys_segments(bi);
 	spin_unlock_irq(&conf->device_lock);
 	if (remaining == 0) {
 
@@ -3752,7 +3788,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 		     sector += STRIPE_SECTORS,
 		     scnt++) {
 
-		if (scnt < raid_bio->bi_hw_segments)
+		if (scnt < raid5_bi_hw_segments(raid_bio))
 			/* already done this stripe */
 			continue;
 
@@ -3760,7 +3796,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
-			raid_bio->bi_hw_segments = scnt;
+			raid5_set_bi_hw_segments(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
 			return handled;
 		}
@@ -3768,7 +3804,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 		set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
 			release_stripe(sh);
-			raid_bio->bi_hw_segments = scnt;
+			raid5_set_bi_hw_segments(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
 			return handled;
 		}
@@ -3778,7 +3814,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 		handled++;
 	}
 	spin_lock_irq(&conf->device_lock);
-	remaining = --raid_bio->bi_phys_segments;
+	remaining = raid5_dec_bi_phys_segments(raid_bio);
 	spin_unlock_irq(&conf->device_lock);
 	if (remaining == 0)
 		bio_endio(raid_bio, 0);

From 5b99c2ffa980528a197f26c7d876cceeccce8dd5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 15 Aug 2008 10:56:11 +0200
Subject: [PATCH 024/132] block: make bi_phys_segments an unsigned int instead
 of short

raid5 can overflow with more than 255 stripes, and we can increase it
to an int for free on both 32 and 64-bit archs due to the padding.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/md/raid5.c  | 12 ++++++------
 include/linux/bio.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 05b22925cce4..37e546528f9c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -102,17 +102,17 @@ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
 #endif
 
 /*
- * We maintain a biased count of active stripes in the bottom 8 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 8 bits
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
  */
 static inline int raid5_bi_phys_segments(struct bio *bio)
 {
-	return bio->bi_phys_segments & 0xff;
+	return bio->bi_phys_segments & 0xffff;
 }
 
 static inline int raid5_bi_hw_segments(struct bio *bio)
 {
-	return (bio->bi_phys_segments >> 8) & 0xff;
+	return (bio->bi_phys_segments >> 16) & 0xffff;
 }
 
 static inline int raid5_dec_bi_phys_segments(struct bio *bio)
@@ -126,13 +126,13 @@ static inline int raid5_dec_bi_hw_segments(struct bio *bio)
 	unsigned short val = raid5_bi_hw_segments(bio);
 
 	--val;
-	bio->bi_phys_segments = (val << 8) | raid5_bi_phys_segments(bio);
+	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
 	return val;
 }
 
 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 {
-	bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 8);
+	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
 }
 
 static inline int raid6_next_disk(int disk, int raid_disks)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index dfc3556d311c..2c0c09034fd2 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -75,7 +75,7 @@ struct bio {
 	/* Number of segments in this BIO after
 	 * physical address coalescing is performed.
 	 */
-	unsigned short		bi_phys_segments;
+	unsigned int		bi_phys_segments;
 
 	unsigned int		bi_size;	/* residual I/O count */
 

From 710027a48ede75428cc68eaa8ae2269b1e356e2c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 19 Aug 2008 20:13:11 +0200
Subject: [PATCH 025/132] Add some block/ source files to the kernel-api
 docbook. Fix kernel-doc notation in them as needed. Fix changed function
 parameter names. Fix typos/spellos. In comments, change REQ_SPECIAL to
 REQ_TYPE_SPECIAL and REQ_BLOCK_PC to REQ_TYPE_BLOCK_PC.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/DocBook/kernel-api.tmpl |  4 ++
 block/blk-core.c                      | 72 +++++++++++++--------------
 block/blk-exec.c                      |  6 +--
 block/blk-integrity.c                 |  4 +-
 block/blk-map.c                       | 16 +++---
 block/blk-settings.c                  |  8 +--
 block/blk-tag.c                       |  8 +--
 block/genhd.c                         |  5 +-
 8 files changed, 64 insertions(+), 59 deletions(-)

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index b7b1482f6e04..f5696ba9ae96 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -364,6 +364,10 @@ X!Edrivers/pnp/system.c
 !Eblock/blk-barrier.c
 !Eblock/blk-tag.c
 !Iblock/blk-tag.c
+!Eblock/blk-integrity.c
+!Iblock/blktrace.c
+!Iblock/genhd.c
+!Eblock/genhd.c
   </chapter>
 
   <chapter id="chrdev">
diff --git a/block/blk-core.c b/block/blk-core.c
index 2616cdd049a8..86d22e7d65c5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -531,7 +531,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node);
  *    request queue; this lock will be taken also from interrupt context, so irq
  *    disabling is needed for it.
  *
- *    Function returns a pointer to the initialized request queue, or NULL if
+ *    Function returns a pointer to the initialized request queue, or %NULL if
  *    it didn't succeed.
  *
  * Note:
@@ -913,7 +913,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 EXPORT_SYMBOL(blk_requeue_request);
 
 /**
- * blk_insert_request - insert a special request in to a request queue
+ * blk_insert_request - insert a special request into a request queue
  * @q:		request queue where request should be inserted
  * @rq:		request to be inserted
  * @at_head:	insert request at head or tail of queue
@@ -923,8 +923,8 @@ EXPORT_SYMBOL(blk_requeue_request);
  *    Many block devices need to execute commands asynchronously, so they don't
  *    block the whole kernel from preemption during request execution.  This is
  *    accomplished normally by inserting aritficial requests tagged as
- *    REQ_SPECIAL in to the corresponding request queue, and letting them be
- *    scheduled for actual execution by the request queue.
+ *    REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
+ *    be scheduled for actual execution by the request queue.
  *
  *    We have the option of inserting the head or the tail of the queue.
  *    Typically we use the tail for new ioctls and so forth.  We use the head
@@ -1322,7 +1322,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 }
 
 /**
- * generic_make_request: hand a buffer to its device driver for I/O
+ * generic_make_request - hand a buffer to its device driver for I/O
  * @bio:  The bio describing the location in memory and on the device.
  *
  * generic_make_request() is used to make I/O requests of block
@@ -1480,13 +1480,13 @@ void generic_make_request(struct bio *bio)
 EXPORT_SYMBOL(generic_make_request);
 
 /**
- * submit_bio: submit a bio to the block device layer for I/O
+ * submit_bio - submit a bio to the block device layer for I/O
  * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
  * @bio: The &struct bio which describes the I/O
  *
  * submit_bio() is very similar in purpose to generic_make_request(), and
  * uses that function to do most of the work. Both are fairly rough
- * interfaces, @bio must be presetup and ready for I/O.
+ * interfaces; @bio must be presetup and ready for I/O.
  *
  */
 void submit_bio(int rw, struct bio *bio)
@@ -1524,7 +1524,7 @@ EXPORT_SYMBOL(submit_bio);
 /**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
- * @error:    0 for success, < 0 for error
+ * @error:    %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
@@ -1532,8 +1532,8 @@ EXPORT_SYMBOL(submit_bio);
  *     for the next range of segments (if any) in the cluster.
  *
  * Return:
- *     0 - we are done with this request, call end_that_request_last()
- *     1 - still buffers pending for this request
+ *     %0 - we are done with this request, call end_that_request_last()
+ *     %1 - still buffers pending for this request
  **/
 static int __end_that_request_first(struct request *req, int error,
 				    int nr_bytes)
@@ -1544,7 +1544,7 @@ static int __end_that_request_first(struct request *req, int error,
 	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
 
 	/*
-	 * for a REQ_BLOCK_PC request, we want to carry any eventual
+	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
 	 * sense key with us all the way through
 	 */
 	if (!blk_pc_request(req))
@@ -1810,11 +1810,11 @@ EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
 /**
  * end_queued_request - end all I/O on a queued request
  * @rq:		the request being processed
- * @uptodate:	error value or 0/1 uptodate flag
+ * @uptodate:	error value or %0/%1 uptodate flag
  *
  * Description:
  *     Ends all I/O on a request, and removes it from the block layer queues.
- *     Not suitable for normal IO completion, unless the driver still has
+ *     Not suitable for normal I/O completion, unless the driver still has
  *     the request attached to the block layer.
  *
  **/
@@ -1827,7 +1827,7 @@ EXPORT_SYMBOL(end_queued_request);
 /**
  * end_dequeued_request - end all I/O on a dequeued request
  * @rq:		the request being processed
- * @uptodate:	error value or 0/1 uptodate flag
+ * @uptodate:	error value or %0/%1 uptodate flag
  *
  * Description:
  *     Ends all I/O on a request. The request must already have been
@@ -1845,14 +1845,14 @@ EXPORT_SYMBOL(end_dequeued_request);
 /**
  * end_request - end I/O on the current segment of the request
  * @req:	the request being processed
- * @uptodate:	error value or 0/1 uptodate flag
+ * @uptodate:	error value or %0/%1 uptodate flag
  *
  * Description:
  *     Ends I/O on the current segment of a request. If that is the only
  *     remaining segment, the request is also completed and freed.
  *
- *     This is a remnant of how older block drivers handled IO completions.
- *     Modern drivers typically end IO on the full request in one go, unless
+ *     This is a remnant of how older block drivers handled I/O completions.
+ *     Modern drivers typically end I/O on the full request in one go, unless
  *     they have a residual value to account for. For that case this function
  *     isn't really useful, unless the residual just happens to be the
  *     full current segment. In other words, don't use this function in new
@@ -1870,12 +1870,12 @@ EXPORT_SYMBOL(end_request);
 /**
  * blk_end_io - Generic end_io function to complete a request.
  * @rq:           the request being processed
- * @error:        0 for success, < 0 for error
+ * @error:        %0 for success, < %0 for error
  * @nr_bytes:     number of bytes to complete @rq
  * @bidi_bytes:   number of bytes to complete @rq->next_rq
  * @drv_callback: function called between completion of bios in the request
  *                and completion of the request.
- *                If the callback returns non 0, this helper returns without
+ *                If the callback returns non %0, this helper returns without
  *                completion of the request.
  *
  * Description:
@@ -1883,8 +1883,8 @@ EXPORT_SYMBOL(end_request);
  *     If @rq has leftover, sets it up for the next range of segments.
  *
  * Return:
- *     0 - we are done with this request
- *     1 - this request is not freed yet, it still has pending buffers.
+ *     %0 - we are done with this request
+ *     %1 - this request is not freed yet, it still has pending buffers.
  **/
 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 		      unsigned int bidi_bytes,
@@ -1919,7 +1919,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 /**
  * blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
- * @error:    0 for success, < 0 for error
+ * @error:    %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
@@ -1927,8 +1927,8 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
  *     If @rq has leftover, sets it up for the next range of segments.
  *
  * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
+ *     %0 - we are done with this request
+ *     %1 - still buffers pending for this request
  **/
 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
@@ -1939,15 +1939,15 @@ EXPORT_SYMBOL_GPL(blk_end_request);
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
- * @error:    0 for success, < 0 for error
+ * @error:    %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
  *     Must be called with queue lock held unlike blk_end_request().
  *
  * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
+ *     %0 - we are done with this request
+ *     %1 - still buffers pending for this request
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
@@ -1966,7 +1966,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
 /**
  * blk_end_bidi_request - Helper function for drivers to complete bidi request.
  * @rq:         the bidi request being processed
- * @error:      0 for success, < 0 for error
+ * @error:      %0 for success, < %0 for error
  * @nr_bytes:   number of bytes to complete @rq
  * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
@@ -1974,8 +1974,8 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
  *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
  *
  * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
+ *     %0 - we are done with this request
+ *     %1 - still buffers pending for this request
  **/
 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
 			 unsigned int bidi_bytes)
@@ -1987,11 +1987,11 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 /**
  * blk_end_request_callback - Special helper function for tricky drivers
  * @rq:           the request being processed
- * @error:        0 for success, < 0 for error
+ * @error:        %0 for success, < %0 for error
  * @nr_bytes:     number of bytes to complete
  * @drv_callback: function called between completion of bios in the request
  *                and completion of the request.
- *                If the callback returns non 0, this helper returns without
+ *                If the callback returns non %0, this helper returns without
  *                completion of the request.
  *
  * Description:
@@ -2004,10 +2004,10 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request);
  *     Don't use this interface in other places anymore.
  *
  * Return:
- *     0 - we are done with this request
- *     1 - this request is not freed yet.
- *         this request still has pending buffers or
- *         the driver doesn't want to finish this request yet.
+ *     %0 - we are done with this request
+ *     %1 - this request is not freed yet.
+ *          this request still has pending buffers or
+ *          the driver doesn't want to finish this request yet.
  **/
 int blk_end_request_callback(struct request *rq, int error,
 			     unsigned int nr_bytes,
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 9bceff7674f2..6af716d1e54e 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -16,7 +16,7 @@
 /**
  * blk_end_sync_rq - executes a completion event on a request
  * @rq: request to complete
- * @error: end io status of the request
+ * @error: end I/O status of the request
  */
 static void blk_end_sync_rq(struct request *rq, int error)
 {
@@ -41,7 +41,7 @@ static void blk_end_sync_rq(struct request *rq, int error)
  * @done:	I/O completion handler
  *
  * Description:
- *    Insert a fully prepared request at the back of the io scheduler queue
+ *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution.  Don't wait for completion.
  */
 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
@@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
  * @at_head:    insert request at head or tail of queue
  *
  * Description:
- *    Insert a fully prepared request at the back of the io scheduler queue
+ *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution and wait for completion.
  */
 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 3f1a8478cc38..d87606eaca1d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -109,8 +109,8 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
 
 /**
  * blk_integrity_compare - Compare integrity profile of two block devices
- * @b1:		Device to compare
- * @b2:		Device to compare
+ * @bd1:	Device to compare
+ * @bd2:	Device to compare
  *
  * Description: Meta-devices like DM and MD need to verify that all
  * sub-devices use the same integrity format before advertising to
diff --git a/block/blk-map.c b/block/blk-map.c
index af37e4ae62f5..ea1bf53929e4 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -85,17 +85,17 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 }
 
 /**
- * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:		request queue where request should be inserted
  * @rq:		request structure to fill
  * @ubuf:	the user buffer
  * @len:	length of user data
  *
  * Description:
- *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    Data will be mapped directly for zero copy I/O, if possible. Otherwise
  *    a kernel bounce buffer is used.
  *
- *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    A matching blk_rq_unmap_user() must be issued at the end of I/O, while
  *    still in process context.
  *
  *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
@@ -154,7 +154,7 @@ unmap_rq:
 EXPORT_SYMBOL(blk_rq_map_user);
 
 /**
- * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:		request queue where request should be inserted
  * @rq:		request to map data to
  * @iov:	pointer to the iovec
@@ -162,10 +162,10 @@ EXPORT_SYMBOL(blk_rq_map_user);
  * @len:	I/O byte count
  *
  * Description:
- *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    Data will be mapped directly for zero copy I/O, if possible. Otherwise
  *    a kernel bounce buffer is used.
  *
- *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    A matching blk_rq_unmap_user() must be issued at the end of I/O, while
  *    still in process context.
  *
  *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
@@ -224,7 +224,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
  * Description:
  *    Unmap a rq previously mapped by blk_rq_map_user(). The caller must
  *    supply the original rq->bio from the blk_rq_map_user() return, since
- *    the io completion may have changed rq->bio.
+ *    the I/O completion may have changed rq->bio.
  */
 int blk_rq_unmap_user(struct bio *bio)
 {
@@ -250,7 +250,7 @@ int blk_rq_unmap_user(struct bio *bio)
 EXPORT_SYMBOL(blk_rq_unmap_user);
 
 /**
- * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:		request queue where request should be inserted
  * @rq:		request to fill
  * @kbuf:	the kernel buffer
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 539d873c820d..d70692badcdb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -144,7 +144,7 @@ EXPORT_SYMBOL(blk_queue_make_request);
  *    Different hardware can have different requirements as to what pages
  *    it can do I/O directly to. A low level driver can call
  *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
- *    buffers for doing I/O to pages residing above @page.
+ *    buffers for doing I/O to pages residing above @dma_addr.
  **/
 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
 {
@@ -229,7 +229,7 @@ EXPORT_SYMBOL(blk_queue_max_phys_segments);
  * Description:
  *    Enables a low level driver to set an upper limit on the number of
  *    hw data segments in a request.  This would be the largest number of
- *    address/length pairs the host adapter can actually give as once
+ *    address/length pairs the host adapter can actually give at once
  *    to the device.
  **/
 void blk_queue_max_hw_segments(struct request_queue *q,
@@ -410,7 +410,7 @@ EXPORT_SYMBOL(blk_queue_segment_boundary);
  * @mask:  alignment mask
  *
  * description:
- *    set required memory and length aligment for direct dma transactions.
+ *    set required memory and length alignment for direct dma transactions.
  *    this is used when buiding direct io requests for the queue.
  *
  **/
@@ -426,7 +426,7 @@ EXPORT_SYMBOL(blk_queue_dma_alignment);
  * @mask:  alignment mask
  *
  * description:
- *    update required memory and length aligment for direct dma transactions.
+ *    update required memory and length alignment for direct dma transactions.
  *    If the requested alignment is larger than the current alignment, then
  *    the current queue alignment is updated to the new value, otherwise it
  *    is left alone.  The design of this is to allow multiple objects
diff --git a/block/blk-tag.c b/block/blk-tag.c
index ed5166fbc599..8a99688eb1b1 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -29,7 +29,7 @@ EXPORT_SYMBOL(blk_queue_find_tag);
  * __blk_free_tags - release a given set of tag maintenance info
  * @bqt:	the tag map to free
  *
- * Tries to free the specified @bqt@.  Returns true if it was
+ * Tries to free the specified @bqt.  Returns true if it was
  * actually freed and false if there are still references using it
  */
 static int __blk_free_tags(struct blk_queue_tag *bqt)
@@ -78,7 +78,7 @@ void __blk_queue_free_tags(struct request_queue *q)
  * blk_free_tags - release a given set of tag maintenance info
  * @bqt:	the tag map to free
  *
- * For externally managed @bqt@ frees the map.  Callers of this
+ * For externally managed @bqt frees the map.  Callers of this
  * function must guarantee to have released all the queues that
  * might have been using this tag map.
  */
@@ -94,7 +94,7 @@ EXPORT_SYMBOL(blk_free_tags);
  * @q:  the request queue for the device
  *
  *  Notes:
- *	This is used to disabled tagged queuing to a device, yet leave
+ *	This is used to disable tagged queuing to a device, yet leave
  *	queue in function.
  **/
 void blk_queue_free_tags(struct request_queue *q)
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
  * @rq: the request that has completed
  *
  *  Description:
- *    Typically called when end_that_request_first() returns 0, meaning
+ *    Typically called when end_that_request_first() returns %0, meaning
  *    all transfers have been done for a request. It's important to call
  *    this function before end_that_request_last(), as that will put the
  *    request back on the free list thus corrupting the internal tag list.
diff --git a/block/genhd.c b/block/genhd.c
index e0ce23ac2ece..c114a43052de 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -211,10 +211,11 @@ void unlink_gendisk(struct gendisk *disk)
 
 /**
  * get_gendisk - get partitioning information for a given device
- * @dev: device to get partitioning information for
+ * @devt: device to get partitioning information for
+ * @part: returned partition index
  *
  * This function gets the structure containing partitioning
- * information for the given device @dev.
+ * information for the given device @devt.
  */
 struct gendisk *get_gendisk(dev_t devt, int *part)
 {

From a1ed5b0cffe4b16a93a6a3390e8cee0fbef94f86 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:50:16 +0200
Subject: [PATCH 026/132] klist: don't iterate over deleted entries

A klist entry is kept on the list till all its current iterations are
finished; however, a new iteration after deletion also iterates over
deleted entries as long as their reference count stays above zero.
This causes problems for cases where there are users which iterate
over the list while synchronized against list manipulations and
naturally expect already deleted entries to not show up during
iteration.

This patch implements a dead flag which gets set on deletion so that
iteration can skip already deleted entries.  The dead flag piggybacks
on the lowest bit of knode->n_klist and is only visible to the klist
implementation proper.

While at it, drop klist_iter->i_head as it's redundant and doesn't
offer anything semantics- or performance-wise, as klist_iter->i_klist
is dereferenced on every iteration anyway.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/klist.h |   3 +-
 lib/klist.c           | 102 ++++++++++++++++++++++++++++++------------
 2 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/include/linux/klist.h b/include/linux/klist.h
index 06c338ef7f1b..8ea98db223e5 100644
--- a/include/linux/klist.h
+++ b/include/linux/klist.h
@@ -38,7 +38,7 @@ extern void klist_init(struct klist *k, void (*get)(struct klist_node *),
 		       void (*put)(struct klist_node *));
 
 struct klist_node {
-	struct klist		*n_klist;
+	void			*n_klist;	/* never access directly */
 	struct list_head	n_node;
 	struct kref		n_ref;
 	struct completion	n_removed;
@@ -57,7 +57,6 @@ extern int klist_node_attached(struct klist_node *n);
 
 struct klist_iter {
 	struct klist		*i_klist;
-	struct list_head	*i_head;
 	struct klist_node	*i_cur;
 };
 
diff --git a/lib/klist.c b/lib/klist.c
index cca37f96faa2..bbdd3015c2c7 100644
--- a/lib/klist.c
+++ b/lib/klist.c
@@ -37,6 +37,37 @@
 #include <linux/klist.h>
 #include <linux/module.h>
 
+/*
+ * Use the lowest bit of n_klist to mark deleted nodes and exclude
+ * dead ones from iteration.
+ */
+#define KNODE_DEAD		1LU
+#define KNODE_KLIST_MASK	~KNODE_DEAD
+
+static struct klist *knode_klist(struct klist_node *knode)
+{
+	return (struct klist *)
+		((unsigned long)knode->n_klist & KNODE_KLIST_MASK);
+}
+
+static bool knode_dead(struct klist_node *knode)
+{
+	return (unsigned long)knode->n_klist & KNODE_DEAD;
+}
+
+static void knode_set_klist(struct klist_node *knode, struct klist *klist)
+{
+	knode->n_klist = klist;
+	/* no knode deserves to start its life dead */
+	WARN_ON(knode_dead(knode));
+}
+
+static void knode_kill(struct klist_node *knode)
+{
+	/* and no knode should die twice ever either, see we're very humane */
+	WARN_ON(knode_dead(knode));
+	*(unsigned long *)&knode->n_klist |= KNODE_DEAD;
+}
 
 /**
  * klist_init - Initialize a klist structure.
@@ -79,7 +110,7 @@ static void klist_node_init(struct klist *k, struct klist_node *n)
 	INIT_LIST_HEAD(&n->n_node);
 	init_completion(&n->n_removed);
 	kref_init(&n->n_ref);
-	n->n_klist = k;
+	knode_set_klist(n, k);
 	if (k->get)
 		k->get(n);
 }
@@ -115,7 +146,7 @@ EXPORT_SYMBOL_GPL(klist_add_tail);
  */
 void klist_add_after(struct klist_node *n, struct klist_node *pos)
 {
-	struct klist *k = pos->n_klist;
+	struct klist *k = knode_klist(pos);
 
 	klist_node_init(k, n);
 	spin_lock(&k->k_lock);
@@ -131,7 +162,7 @@ EXPORT_SYMBOL_GPL(klist_add_after);
  */
 void klist_add_before(struct klist_node *n, struct klist_node *pos)
 {
-	struct klist *k = pos->n_klist;
+	struct klist *k = knode_klist(pos);
 
 	klist_node_init(k, n);
 	spin_lock(&k->k_lock);
@@ -144,9 +175,10 @@ static void klist_release(struct kref *kref)
 {
 	struct klist_node *n = container_of(kref, struct klist_node, n_ref);
 
+	WARN_ON(!knode_dead(n));
 	list_del(&n->n_node);
 	complete(&n->n_removed);
-	n->n_klist = NULL;
+	knode_set_klist(n, NULL);
 }
 
 static int klist_dec_and_del(struct klist_node *n)
@@ -154,21 +186,28 @@ static int klist_dec_and_del(struct klist_node *n)
 	return kref_put(&n->n_ref, klist_release);
 }
 
+static void klist_put(struct klist_node *n, bool kill)
+{
+	struct klist *k = knode_klist(n);
+	void (*put)(struct klist_node *) = k->put;
+
+	spin_lock(&k->k_lock);
+	if (kill)
+		knode_kill(n);
+	if (!klist_dec_and_del(n))
+		put = NULL;
+	spin_unlock(&k->k_lock);
+	if (put)
+		put(n);
+}
+
 /**
  * klist_del - Decrement the reference count of node and try to remove.
  * @n: node we're deleting.
  */
 void klist_del(struct klist_node *n)
 {
-	struct klist *k = n->n_klist;
-	void (*put)(struct klist_node *) = k->put;
-
-	spin_lock(&k->k_lock);
-	if (!klist_dec_and_del(n))
-		put = NULL;
-	spin_unlock(&k->k_lock);
-	if (put)
-		put(n);
+	klist_put(n, true);
 }
 EXPORT_SYMBOL_GPL(klist_del);
 
@@ -206,7 +245,6 @@ void klist_iter_init_node(struct klist *k, struct klist_iter *i,
 			  struct klist_node *n)
 {
 	i->i_klist = k;
-	i->i_head = &k->k_list;
 	i->i_cur = n;
 	if (n)
 		kref_get(&n->n_ref);
@@ -237,7 +275,7 @@ EXPORT_SYMBOL_GPL(klist_iter_init);
 void klist_iter_exit(struct klist_iter *i)
 {
 	if (i->i_cur) {
-		klist_del(i->i_cur);
+		klist_put(i->i_cur, false);
 		i->i_cur = NULL;
 	}
 }
@@ -258,27 +296,33 @@ static struct klist_node *to_klist_node(struct list_head *n)
  */
 struct klist_node *klist_next(struct klist_iter *i)
 {
-	struct list_head *next;
-	struct klist_node *lnode = i->i_cur;
-	struct klist_node *knode = NULL;
 	void (*put)(struct klist_node *) = i->i_klist->put;
+	struct klist_node *last = i->i_cur;
+	struct klist_node *next;
 
 	spin_lock(&i->i_klist->k_lock);
-	if (lnode) {
-		next = lnode->n_node.next;
-		if (!klist_dec_and_del(lnode))
+
+	if (last) {
+		next = to_klist_node(last->n_node.next);
+		if (!klist_dec_and_del(last))
 			put = NULL;
 	} else
-		next = i->i_head->next;
+		next = to_klist_node(i->i_klist->k_list.next);
 
-	if (next != i->i_head) {
-		knode = to_klist_node(next);
-		kref_get(&knode->n_ref);
+	i->i_cur = NULL;
+	while (next != to_klist_node(&i->i_klist->k_list)) {
+		if (likely(!knode_dead(next))) {
+			kref_get(&next->n_ref);
+			i->i_cur = next;
+			break;
+		}
+		next = to_klist_node(next->n_node.next);
 	}
-	i->i_cur = knode;
+
 	spin_unlock(&i->i_klist->k_lock);
-	if (put && lnode)
-		put(lnode);
-	return knode;
+
+	if (put && last)
+		put(last);
+	return i->i_cur;
 }
 EXPORT_SYMBOL_GPL(klist_next);

From 5a3ceb861663040f9ef0176df4aaa494bba5e352 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:50:19 +0200
Subject: [PATCH 027/132] driver-core: use klist for class device list and
 implement iterator

Iterating over entries using a callback usually isn't too fun, especially
when the entry being iterated over can't be manipulated freely.  This
patch converts class->p->class_devices to klist and implements class
device iterator so that the users can freely build their own control
structure.  The users are also free to call back into class code
without worrying about locking.

class_for_each_device() and class_find_device() are converted to use
the new iterators, so their users don't have to worry about locking
anymore either.

Note: This depends on klist-dont-iterate-over-deleted-entries patch
because class_intf->add/remove_dev() depends on proper synchronization
with device removal.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/base/base.h    |   2 +-
 drivers/base/class.c   | 136 +++++++++++++++++++++++++++++++----------
 drivers/base/core.c    |   6 +-
 include/linux/device.h |  14 ++++-
 4 files changed, 120 insertions(+), 38 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 31dc0cd84afa..0a5f055dffba 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -54,7 +54,7 @@ struct driver_private {
  */
 struct class_private {
 	struct kset class_subsys;
-	struct list_head class_devices;
+	struct klist class_devices;
 	struct list_head class_interfaces;
 	struct kset class_dirs;
 	struct mutex class_mutex;
diff --git a/drivers/base/class.c b/drivers/base/class.c
index cc5e28c8885c..eb85e4312301 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -135,6 +135,20 @@ static void remove_class_attrs(struct class *cls)
 	}
 }
 
+static void klist_class_dev_get(struct klist_node *n)
+{
+	struct device *dev = container_of(n, struct device, knode_class);
+
+	get_device(dev);
+}
+
+static void klist_class_dev_put(struct klist_node *n)
+{
+	struct device *dev = container_of(n, struct device, knode_class);
+
+	put_device(dev);
+}
+
 int __class_register(struct class *cls, struct lock_class_key *key)
 {
 	struct class_private *cp;
@@ -145,7 +159,7 @@ int __class_register(struct class *cls, struct lock_class_key *key)
 	cp = kzalloc(sizeof(*cp), GFP_KERNEL);
 	if (!cp)
 		return -ENOMEM;
-	INIT_LIST_HEAD(&cp->class_devices);
+	klist_init(&cp->class_devices, klist_class_dev_get, klist_class_dev_put);
 	INIT_LIST_HEAD(&cp->class_interfaces);
 	kset_init(&cp->class_dirs);
 	__mutex_init(&cp->class_mutex, "struct class mutex", key);
@@ -268,6 +282,71 @@ char *make_class_name(const char *name, struct kobject *kobj)
 }
 #endif
 
+/**
+ * class_dev_iter_init - initialize class device iterator
+ * @iter: class iterator to initialize
+ * @class: the class we wanna iterate over
+ * @start: the device to start iterating from, if any
+ * @type: device_type of the devices to iterate over, NULL for all
+ *
+ * Initialize class iterator @iter such that it iterates over devices
+ * of @class.  If @start is set, the list iteration will start there,
+ * otherwise if it is NULL, the iteration starts at the beginning of
+ * the list.
+ */
+void class_dev_iter_init(struct class_dev_iter *iter, struct class *class,
+			 struct device *start, const struct device_type *type)
+{
+	struct klist_node *start_knode = NULL;
+
+	if (start)
+		start_knode = &start->knode_class;
+	klist_iter_init_node(&class->p->class_devices, &iter->ki, start_knode);
+	iter->type = type;
+}
+EXPORT_SYMBOL_GPL(class_dev_iter_init);
+
+/**
+ * class_dev_iter_next - iterate to the next device
+ * @iter: class iterator to proceed
+ *
+ * Proceed @iter to the next device and return it.  Returns NULL if
+ * iteration is complete.
+ *
+ * The returned device is referenced and won't be released till
+ * iterator is advanced to the next device or exited.  The caller is
+ * free to do whatever it wants to do with the device including
+ * calling back into class code.
+ */
+struct device *class_dev_iter_next(struct class_dev_iter *iter)
+{
+	struct klist_node *knode;
+	struct device *dev;
+
+	while (1) {
+		knode = klist_next(&iter->ki);
+		if (!knode)
+			return NULL;
+		dev = container_of(knode, struct device, knode_class);
+		if (!iter->type || iter->type == dev->type)
+			return dev;
+	}
+}
+EXPORT_SYMBOL_GPL(class_dev_iter_next);
+
+/**
+ * class_dev_iter_exit - finish iteration
+ * @iter: class iterator to finish
+ *
+ * Finish an iteration.  Always call this function after iteration is
+ * complete whether the iteration ran till the end or not.
+ */
+void class_dev_iter_exit(struct class_dev_iter *iter)
+{
+	klist_iter_exit(&iter->ki);
+}
+EXPORT_SYMBOL_GPL(class_dev_iter_exit);
+
 /**
  * class_for_each_device - device iterator
  * @class: the class we're iterating
@@ -283,13 +362,13 @@ char *make_class_name(const char *name, struct kobject *kobj)
  * We check the return of @fn each time. If it returns anything
  * other than 0, we break out and return that value.
  *
- * Note, we hold class->class_mutex in this function, so it can not be
- * re-acquired in @fn, otherwise it will self-deadlocking. For
- * example, calls to add or remove class members would be verboten.
+ * @fn is allowed to do anything including calling back into class
+ * code.  There's no locking restriction.
  */
 int class_for_each_device(struct class *class, struct device *start,
 			  void *data, int (*fn)(struct device *, void *))
 {
+	struct class_dev_iter iter;
 	struct device *dev;
 	int error = 0;
 
@@ -301,20 +380,13 @@ int class_for_each_device(struct class *class, struct device *start,
 		return -EINVAL;
 	}
 
-	mutex_lock(&class->p->class_mutex);
-	list_for_each_entry(dev, &class->p->class_devices, node) {
-		if (start) {
-			if (start == dev)
-				start = NULL;
-			continue;
-		}
-		dev = get_device(dev);
+	class_dev_iter_init(&iter, class, start, NULL);
+	while ((dev = class_dev_iter_next(&iter))) {
 		error = fn(dev, data);
-		put_device(dev);
 		if (error)
 			break;
 	}
-	mutex_unlock(&class->p->class_mutex);
+	class_dev_iter_exit(&iter);
 
 	return error;
 }
@@ -337,16 +409,15 @@ EXPORT_SYMBOL_GPL(class_for_each_device);
  *
  * Note, you will need to drop the reference with put_device() after use.
  *
- * We hold class->class_mutex in this function, so it can not be
- * re-acquired in @match, otherwise it will self-deadlocking. For
- * example, calls to add or remove class members would be verboten.
+ * @match is allowed to do anything including calling back into class
+ * code.  There's no locking restriction.
  */
 struct device *class_find_device(struct class *class, struct device *start,
 				 void *data,
 				 int (*match)(struct device *, void *))
 {
+	struct class_dev_iter iter;
 	struct device *dev;
-	int found = 0;
 
 	if (!class)
 		return NULL;
@@ -356,29 +427,23 @@ struct device *class_find_device(struct class *class, struct device *start,
 		return NULL;
 	}
 
-	mutex_lock(&class->p->class_mutex);
-	list_for_each_entry(dev, &class->p->class_devices, node) {
-		if (start) {
-			if (start == dev)
-				start = NULL;
-			continue;
-		}
-		dev = get_device(dev);
+	class_dev_iter_init(&iter, class, start, NULL);
+	while ((dev = class_dev_iter_next(&iter))) {
 		if (match(dev, data)) {
-			found = 1;
+			get_device(dev);
 			break;
-		} else
-			put_device(dev);
+		}
 	}
-	mutex_unlock(&class->p->class_mutex);
+	class_dev_iter_exit(&iter);
 
-	return found ? dev : NULL;
+	return dev;
 }
 EXPORT_SYMBOL_GPL(class_find_device);
 
 int class_interface_register(struct class_interface *class_intf)
 {
 	struct class *parent;
+	struct class_dev_iter iter;
 	struct device *dev;
 
 	if (!class_intf || !class_intf->class)
@@ -391,8 +456,10 @@ int class_interface_register(struct class_interface *class_intf)
 	mutex_lock(&parent->p->class_mutex);
 	list_add_tail(&class_intf->node, &parent->p->class_interfaces);
 	if (class_intf->add_dev) {
-		list_for_each_entry(dev, &parent->p->class_devices, node)
+		class_dev_iter_init(&iter, parent, NULL, NULL);
+		while ((dev = class_dev_iter_next(&iter)))
 			class_intf->add_dev(dev, class_intf);
+		class_dev_iter_exit(&iter);
 	}
 	mutex_unlock(&parent->p->class_mutex);
 
@@ -402,6 +469,7 @@ int class_interface_register(struct class_interface *class_intf)
 void class_interface_unregister(struct class_interface *class_intf)
 {
 	struct class *parent = class_intf->class;
+	struct class_dev_iter iter;
 	struct device *dev;
 
 	if (!parent)
@@ -410,8 +478,10 @@ void class_interface_unregister(struct class_interface *class_intf)
 	mutex_lock(&parent->p->class_mutex);
 	list_del_init(&class_intf->node);
 	if (class_intf->remove_dev) {
-		list_for_each_entry(dev, &parent->p->class_devices, node)
+		class_dev_iter_init(&iter, parent, NULL, NULL);
+		while ((dev = class_dev_iter_next(&iter)))
 			class_intf->remove_dev(dev, class_intf);
+		class_dev_iter_exit(&iter);
 	}
 	mutex_unlock(&parent->p->class_mutex);
 
diff --git a/drivers/base/core.c b/drivers/base/core.c
index d021c98605b3..b98cb1416a2d 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -536,7 +536,6 @@ void device_initialize(struct device *dev)
 	klist_init(&dev->klist_children, klist_children_get,
 		   klist_children_put);
 	INIT_LIST_HEAD(&dev->dma_pools);
-	INIT_LIST_HEAD(&dev->node);
 	init_MUTEX(&dev->sem);
 	spin_lock_init(&dev->devres_lock);
 	INIT_LIST_HEAD(&dev->devres_head);
@@ -916,7 +915,8 @@ int device_add(struct device *dev)
 	if (dev->class) {
 		mutex_lock(&dev->class->p->class_mutex);
 		/* tie the class to the device */
-		list_add_tail(&dev->node, &dev->class->p->class_devices);
+		klist_add_tail(&dev->knode_class,
+			       &dev->class->p->class_devices);
 
 		/* notify any interfaces that the device is here */
 		list_for_each_entry(class_intf,
@@ -1032,7 +1032,7 @@ void device_del(struct device *dev)
 			if (class_intf->remove_dev)
 				class_intf->remove_dev(dev, class_intf);
 		/* remove the device from the class list */
-		list_del_init(&dev->node);
+		klist_del(&dev->knode_class);
 		mutex_unlock(&dev->class->p->class_mutex);
 	}
 	device_remove_file(dev, &uevent_attr);
diff --git a/include/linux/device.h b/include/linux/device.h
index 4d8372d135df..246937c9cbc7 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -199,6 +199,11 @@ struct class {
 	struct class_private *p;
 };
 
+struct class_dev_iter {
+	struct klist_iter		ki;
+	const struct device_type	*type;
+};
+
 extern struct kobject *sysfs_dev_block_kobj;
 extern struct kobject *sysfs_dev_char_kobj;
 extern int __must_check __class_register(struct class *class,
@@ -213,6 +218,13 @@ extern void class_unregister(struct class *class);
 	__class_register(class, &__key);	\
 })
 
+extern void class_dev_iter_init(struct class_dev_iter *iter,
+				struct class *class,
+				struct device *start,
+				const struct device_type *type);
+extern struct device *class_dev_iter_next(struct class_dev_iter *iter);
+extern void class_dev_iter_exit(struct class_dev_iter *iter);
+
 extern int class_for_each_device(struct class *class, struct device *start,
 				 void *data,
 				 int (*fn)(struct device *dev, void *data));
@@ -396,7 +408,7 @@ struct device {
 	spinlock_t		devres_lock;
 	struct list_head	devres_head;
 
-	struct list_head	node;
+	struct klist_node	knode_class;
 	struct class		*class;
 	dev_t			devt;	/* dev_t, creates the sysfs "dev" */
 	struct attribute_group	**groups;	/* optional groups */

From ac65ece4eee10b03ac29ee925cadc179dc810bab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:30:12 +0900
Subject: [PATCH 028/132] block: fix partition info printouts

Recent block_class iteration updates 5c6f35c5..27f3025 broke partition
info printouts.

* printk_all_partitions(): Partition print out stops when it meets a
  partition hole.  Partition printing inner loop should continue
  instead of exiting on empty partition slot.

* /proc/partitions and /proc/diskstats: If all information can't be
  read in single read(), the information is truncated.  This is
  because find_start() doesn't actually update the counter containing
  the initial seek.  It runs to the end and ends up always reporting
  EOF on the second read.

This patch fixes both problems.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index c114a43052de..0be95135c404 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -236,7 +236,7 @@ static int printk_partition(struct device *dev, void *data)
 	int n;
 
 	if (dev->type != &disk_type)
-		goto exit;
+		return 0;
 
 	sgp = dev_to_disk(dev);
 	/*
@@ -244,7 +244,7 @@ static int printk_partition(struct device *dev, void *data)
 	 */
 	if (get_capacity(sgp) == 0 ||
 	    (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
-		goto exit;
+		return 0;
 
 	/*
 	 * Note, unlike /proc/partitions, I am showing the numbers in
@@ -264,15 +264,15 @@ static int printk_partition(struct device *dev, void *data)
 	/* now show the partitions */
 	for (n = 0; n < sgp->minors - 1; ++n) {
 		if (sgp->part[n] == NULL)
-			goto exit;
+			continue;
 		if (sgp->part[n]->nr_sects == 0)
-			goto exit;
+			continue;
 		printk("  %02x%02x %10llu %s\n",
 			sgp->major, n + 1 + sgp->first_minor,
 			(unsigned long long)sgp->part[n]->nr_sects >> 1,
 			disk_name(sgp, n + 1, buf));
 	}
-exit:
+
 	return 0;
 }
 

From 2ac3cee5298a247b2774f3319b28a05f588c3f0e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 08:53:37 +0200
Subject: [PATCH 029/132] block: don't grab block_class_lock unnecessarily

block_class_lock protects major_names array and bdev_map and doesn't
have anything to do with block class devices.  Don't grab it while
iterating over block class devices.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 0be95135c404..9eb8b3e212c1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -283,9 +283,7 @@ static int printk_partition(struct device *dev, void *data)
  */
 void __init printk_all_partitions(void)
 {
-	mutex_lock(&block_class_lock);
 	class_for_each_device(&block_class, NULL, NULL, printk_partition);
-	mutex_unlock(&block_class_lock);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -305,17 +303,15 @@ static int find_start(struct device *dev, void *data)
 static void *part_start(struct seq_file *part, loff_t *pos)
 {
 	struct device *dev;
-	loff_t k = *pos;
+	loff_t n = *pos;
 
-	if (!k)
+	if (!n)
 		part->private = (void *)1LU;	/* tell show to print header */
 
-	mutex_lock(&block_class_lock);
-	dev = class_find_device(&block_class, NULL, &k, find_start);
-	if (dev) {
-		put_device(dev);
+	dev = class_find_device(&block_class, NULL, &n, find_start);
+	if (dev)
 		return dev_to_disk(dev);
-	}
+
 	return NULL;
 }
 
@@ -341,7 +337,6 @@ static void *part_next(struct seq_file *part, void *v, loff_t *pos)
 
 static void part_stop(struct seq_file *part, void *v)
 {
-	mutex_unlock(&block_class_lock);
 }
 
 static int show_partition(struct seq_file *part, void *v)
@@ -583,14 +578,12 @@ static struct device_type disk_type = {
 static void *diskstats_start(struct seq_file *part, loff_t *pos)
 {
 	struct device *dev;
-	loff_t k = *pos;
+	loff_t n = *pos;
 
-	mutex_lock(&block_class_lock);
-	dev = class_find_device(&block_class, NULL, &k, find_start);
-	if (dev) {
-		put_device(dev);
+	dev = class_find_device(&block_class, NULL, &n, find_start);
+	if (dev)
 		return dev_to_disk(dev);
-	}
+
 	return NULL;
 }
 
@@ -610,7 +603,6 @@ static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
 
 static void diskstats_stop(struct seq_file *part, void *v)
 {
-	mutex_unlock(&block_class_lock);
 }
 
 static int diskstats_show(struct seq_file *s, void *v)
@@ -729,7 +721,6 @@ dev_t blk_lookup_devt(const char *name, int part)
 	dev_t devt = MKDEV(0, 0);
 	struct find_block find;
 
-	mutex_lock(&block_class_lock);
 	find.name = name;
 	find.part = part;
 	dev = class_find_device(&block_class, NULL, &find, match_id);
@@ -738,7 +729,6 @@ dev_t blk_lookup_devt(const char *name, int part)
 		devt = MKDEV(MAJOR(dev->devt),
 			     MINOR(dev->devt) + part);
 	}
-	mutex_unlock(&block_class_lock);
 
 	return devt;
 }

From def4e38ddda9bef20b69bfa939195c2f79da7979 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 08:57:12 +0200
Subject: [PATCH 030/132] block: use class_dev_iterator instead of
 class_for_each_device()

Recent block_class iteration updates 5c6f35c5..27f3025 converted all
class device iteration to class_for_each_device() and
class_find_device(), which are correct but a pain in the ass to use.
This patch converts them to newly introduced class_dev_iterator so that
they can use more natural control structures instead of separate
callbacks and struct to pass parameters to them.

This results in smaller and easier code.

This patch also restores the original behavior of not printing header
in /proc/partitions if there's no partition to print.  This is trivial
but still user-visible behavior.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 252 +++++++++++++++++++-------------------------------
 1 file changed, 97 insertions(+), 155 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 9eb8b3e212c1..8b9a9ff1a842 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -225,57 +225,6 @@ struct gendisk *get_gendisk(dev_t devt, int *part)
 	return  kobj ? dev_to_disk(dev) : NULL;
 }
 
-/*
- * print a partitions - intended for places where the root filesystem can't be
- * mounted and thus to give the victim some idea of what went wrong
- */
-static int printk_partition(struct device *dev, void *data)
-{
-	struct gendisk *sgp;
-	char buf[BDEVNAME_SIZE];
-	int n;
-
-	if (dev->type != &disk_type)
-		return 0;
-
-	sgp = dev_to_disk(dev);
-	/*
-	 * Don't show empty devices or things that have been surpressed
-	 */
-	if (get_capacity(sgp) == 0 ||
-	    (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
-		return 0;
-
-	/*
-	 * Note, unlike /proc/partitions, I am showing the numbers in
-	 * hex - the same format as the root= option takes.
-	 */
-	printk("%02x%02x %10llu %s",
-		sgp->major, sgp->first_minor,
-		(unsigned long long)get_capacity(sgp) >> 1,
-		disk_name(sgp, 0, buf));
-	if (sgp->driverfs_dev != NULL &&
-	    sgp->driverfs_dev->driver != NULL)
-		printk(" driver: %s\n",
-			sgp->driverfs_dev->driver->name);
-	else
-		printk(" (driver?)\n");
-
-	/* now show the partitions */
-	for (n = 0; n < sgp->minors - 1; ++n) {
-		if (sgp->part[n] == NULL)
-			continue;
-		if (sgp->part[n]->nr_sects == 0)
-			continue;
-		printk("  %02x%02x %10llu %s\n",
-			sgp->major, n + 1 + sgp->first_minor,
-			(unsigned long long)sgp->part[n]->nr_sects >> 1,
-			disk_name(sgp, n + 1, buf));
-	}
-
-	return 0;
-}
-
 /*
  * print a full list of all partitions - intended for places where the root
  * filesystem can't be mounted and thus to give the victim some idea of what
@@ -283,60 +232,108 @@ static int printk_partition(struct device *dev, void *data)
  */
 void __init printk_all_partitions(void)
 {
-	class_for_each_device(&block_class, NULL, NULL, printk_partition);
+	struct class_dev_iter iter;
+	struct device *dev;
+
+	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+	while ((dev = class_dev_iter_next(&iter))) {
+		struct gendisk *disk = dev_to_disk(dev);
+		char buf[BDEVNAME_SIZE];
+		int n;
+
+		/*
+		 * Don't show empty devices or things that have been
+		 * surpressed
+		 */
+		if (get_capacity(disk) == 0 ||
+		    (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
+			continue;
+
+		/*
+		 * Note, unlike /proc/partitions, I am showing the
+		 * numbers in hex - the same format as the root=
+		 * option takes.
+		 */
+		printk("%02x%02x %10llu %s",
+		       disk->major, disk->first_minor,
+		       (unsigned long long)get_capacity(disk) >> 1,
+		       disk_name(disk, 0, buf));
+		if (disk->driverfs_dev != NULL &&
+		    disk->driverfs_dev->driver != NULL)
+			printk(" driver: %s\n",
+			       disk->driverfs_dev->driver->name);
+		else
+			printk(" (driver?)\n");
+
+		/* now show the partitions */
+		for (n = 0; n < disk->minors - 1; ++n) {
+			if (disk->part[n] == NULL)
+				continue;
+			if (disk->part[n]->nr_sects == 0)
+				continue;
+			printk("  %02x%02x %10llu %s\n",
+			       disk->major, n + 1 + disk->first_minor,
+			       (unsigned long long)disk->part[n]->nr_sects >> 1,
+			       disk_name(disk, n + 1, buf));
+		}
+	}
+	class_dev_iter_exit(&iter);
 }
 
 #ifdef CONFIG_PROC_FS
 /* iterator */
-static int find_start(struct device *dev, void *data)
+static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
 {
-	loff_t *k = data;
+	loff_t skip = *pos;
+	struct class_dev_iter *iter;
+	struct device *dev;
 
-	if (dev->type != &disk_type)
-		return 0;
-	if (!*k)
-		return 1;
-	(*k)--;
-	return 0;
+	iter = kmalloc(GFP_KERNEL, sizeof(*iter));
+	if (!iter)
+		return ERR_PTR(-ENOMEM);
+
+	seqf->private = iter;
+	class_dev_iter_init(iter, &block_class, NULL, &disk_type);
+	do {
+		dev = class_dev_iter_next(iter);
+		if (!dev)
+			return NULL;
+	} while (skip--);
+
+	return dev_to_disk(dev);
 }
 
-static void *part_start(struct seq_file *part, loff_t *pos)
+static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos)
 {
 	struct device *dev;
-	loff_t n = *pos;
 
-	if (!n)
-		part->private = (void *)1LU;	/* tell show to print header */
-
-	dev = class_find_device(&block_class, NULL, &n, find_start);
+	(*pos)++;
+	dev = class_dev_iter_next(seqf->private);
 	if (dev)
 		return dev_to_disk(dev);
 
 	return NULL;
 }
 
-static int find_next(struct device *dev, void *data)
+static void disk_seqf_stop(struct seq_file *seqf, void *v)
 {
-	if (dev->type == &disk_type)
-		return 1;
-	return 0;
-}
+	struct class_dev_iter *iter = seqf->private;
 
-static void *part_next(struct seq_file *part, void *v, loff_t *pos)
-{
-	struct gendisk *gp = v;
-	struct device *dev;
-	++*pos;
-	dev = class_find_device(&block_class, &gp->dev, NULL, find_next);
-	if (dev) {
-		put_device(dev);
-		return dev_to_disk(dev);
+	/* stop is called even after start failed :-( */
+	if (iter) {
+		class_dev_iter_exit(iter);
+		kfree(iter);
 	}
-	return NULL;
 }
 
-static void part_stop(struct seq_file *part, void *v)
+static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 {
+	static void *p;
+
+	p = disk_seqf_start(seqf, pos);
+	if (!IS_ERR(p) && p)
+		seq_puts(seqf, "major minor  #blocks  name\n\n");
+	return p;
 }
 
 static int show_partition(struct seq_file *part, void *v)
@@ -383,9 +380,9 @@ static int show_partition(struct seq_file *part, void *v)
 }
 
 const struct seq_operations partitions_op = {
-	.start	= part_start,
-	.next	= part_next,
-	.stop	= part_stop,
+	.start	= show_partition_start,
+	.next	= disk_seqf_next,
+	.stop	= disk_seqf_stop,
 	.show	= show_partition
 };
 #endif
@@ -567,44 +564,6 @@ static struct device_type disk_type = {
 };
 
 #ifdef CONFIG_PROC_FS
-/*
- * aggregate disk stat collector.  Uses the same stats that the sysfs
- * entries do, above, but makes them available through one seq_file.
- *
- * The output looks suspiciously like /proc/partitions with a bunch of
- * extra fields.
- */
-
-static void *diskstats_start(struct seq_file *part, loff_t *pos)
-{
-	struct device *dev;
-	loff_t n = *pos;
-
-	dev = class_find_device(&block_class, NULL, &n, find_start);
-	if (dev)
-		return dev_to_disk(dev);
-
-	return NULL;
-}
-
-static void *diskstats_next(struct seq_file *part, void *v, loff_t *pos)
-{
-	struct gendisk *gp = v;
-	struct device *dev;
-
-	++*pos;
-	dev = class_find_device(&block_class, &gp->dev, NULL, find_next);
-	if (dev) {
-		put_device(dev);
-		return dev_to_disk(dev);
-	}
-	return NULL;
-}
-
-static void diskstats_stop(struct seq_file *part, void *v)
-{
-}
-
 static int diskstats_show(struct seq_file *s, void *v)
 {
 	struct gendisk *gp = v;
@@ -666,9 +625,9 @@ static int diskstats_show(struct seq_file *s, void *v)
 }
 
 const struct seq_operations diskstats_op = {
-	.start	= diskstats_start,
-	.next	= diskstats_next,
-	.stop	= diskstats_stop,
+	.start	= disk_seqf_start,
+	.next	= disk_seqf_next,
+	.stop	= disk_seqf_stop,
 	.show	= diskstats_show
 };
 #endif /* CONFIG_PROC_FS */
@@ -696,40 +655,23 @@ void genhd_media_change_notify(struct gendisk *disk)
 EXPORT_SYMBOL_GPL(genhd_media_change_notify);
 #endif  /*  0  */
 
-struct find_block {
-	const char *name;
-	int part;
-};
-
-static int match_id(struct device *dev, void *data)
-{
-	struct find_block *find = data;
-
-	if (dev->type != &disk_type)
-		return 0;
-	if (strcmp(dev->bus_id, find->name) == 0) {
-		struct gendisk *disk = dev_to_disk(dev);
-		if (find->part < disk->minors)
-			return 1;
-	}
-	return 0;
-}
-
 dev_t blk_lookup_devt(const char *name, int part)
 {
-	struct device *dev;
 	dev_t devt = MKDEV(0, 0);
-	struct find_block find;
+	struct class_dev_iter iter;
+	struct device *dev;
 
-	find.name = name;
-	find.part = part;
-	dev = class_find_device(&block_class, NULL, &find, match_id);
-	if (dev) {
-		put_device(dev);
-		devt = MKDEV(MAJOR(dev->devt),
-			     MINOR(dev->devt) + part);
+	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+	while ((dev = class_dev_iter_next(&iter))) {
+		struct gendisk *disk = dev_to_disk(dev);
+
+		if (!strcmp(dev->bus_id, name) && part < disk->minors) {
+			devt = MKDEV(MAJOR(dev->devt),
+				     MINOR(dev->devt) + part);
+			break;
+		}
 	}
-
+	class_dev_iter_exit(&iter);
 	return devt;
 }
 EXPORT_SYMBOL(blk_lookup_devt);

From ec2cdedf798385a9397ac50dd0405dd658f8529c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:30:15 +0900
Subject: [PATCH 031/132] block: allow deleting zero length partition

delete_partition() was a noop for zero length partitions.  As the
addition code allows creating zero length partitions and deletion is
assumed to always succeed, this causes a memory leak for zero length
partitions.  Allow zero length partitions to end their meaningless
lives.

While at it, allow deleting zero length partitions via the
BLKPG_DEL_PARTITION ioctl too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ioctl.c         | 2 --
 fs/partitions/check.c | 2 --
 2 files changed, 4 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index 375c57922b00..c722de0ef2ee 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -68,8 +68,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 		case BLKPG_DEL_PARTITION:
 			if (!disk->part[part-1])
 				return -ENXIO;
-			if (disk->part[part - 1]->nr_sects == 0)
-				return -ENXIO;
 			bdevp = bdget_disk(disk, part);
 			if (!bdevp)
 				return -ENOMEM;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ecc3330972e5..68f3e41ae66f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -325,8 +325,6 @@ void delete_partition(struct gendisk *disk, int part)
 
 	if (!p)
 		return;
-	if (!p->nr_sects)
-		return;
 	disk->part[part-1] = NULL;
 	p->start_sect = 0;
 	p->nr_sects = 0;

From 88e341261ca4d39eec21b212961c77eff51105f7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:30:16 +0900
Subject: [PATCH 032/132] block: update add_partition() error handling

d805dda4 tried to fix error case handling in add_partition() but had a
few problems.

* disk->part[] entry is set early and left dangling if operation
  fails.

* Once the device is initialized, the last put_device() is responsible
  for freeing all the resources.  The failure path freed part_stats and
  p regardless of put_device(), causing a double free.

* holders subdir holds reference to the disk device, so failure path
  should remove it to release resources properly which was missing.

This patch fixes the above problems and, while at it, moves the
partition slot busy check into add_partition() for completeness and
inlines holders subdirectory creation.  Using a separate function for
it just obfuscates the code.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Abdel Benamrouche <draconux@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ioctl.c         |  7 ++-----
 fs/partitions/check.c | 42 ++++++++++++++++++++++--------------------
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index c722de0ef2ee..eb046aeede8a 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -43,12 +43,9 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 				    || pstart < 0 || plength < 0)
 					return -EINVAL;
 			}
-			/* partition number in use? */
+
 			mutex_lock(&bdev->bd_mutex);
-			if (disk->part[part - 1]) {
-				mutex_unlock(&bdev->bd_mutex);
-				return -EBUSY;
-			}
+
 			/* overlap? */
 			for (i = 0; i < disk->minors - 1; i++) {
 				struct hd_struct *s = disk->part[i];
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 68f3e41ae66f..16f98d824608 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -300,15 +300,6 @@ struct device_type part_type = {
 	.release	= part_release,
 };
 
-static inline void partition_sysfs_add_subdir(struct hd_struct *p)
-{
-	struct kobject *k;
-
-	k = kobject_get(&p->dev.kobj);
-	p->holder_dir = kobject_create_and_add("holders", k);
-	kobject_put(k);
-}
-
 static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 {
 	struct kobject *k;
@@ -347,13 +338,16 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	struct hd_struct *p;
 	int err;
 
+	if (disk->part[part - 1])
+		return -EBUSY;
+
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
 
 	if (!init_part_stats(p)) {
 		err = -ENOMEM;
-		goto out0;
+		goto out_free;
 	}
 	p->start_sect = start;
 	p->nr_sects = len;
@@ -372,35 +366,43 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	p->dev.class = &block_class;
 	p->dev.type = &part_type;
 	p->dev.parent = &disk->dev;
-	disk->part[part-1] = p;
 
 	/* delay uevent until 'holders' subdir is created */
 	p->dev.uevent_suppress = 1;
 	err = device_add(&p->dev);
 	if (err)
-		goto out1;
-	partition_sysfs_add_subdir(p);
+		goto out_put;
+
+	err = -ENOMEM;
+	p->holder_dir = kobject_create_and_add("holders", &p->dev.kobj);
+	if (!p->holder_dir)
+		goto out_del;
+
 	p->dev.uevent_suppress = 0;
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		err = device_create_file(&p->dev, &dev_attr_whole_disk);
 		if (err)
-			goto out2;
+			goto out_del;
 	}
 
+	/* everything is up and running, commence */
+	disk->part[part - 1] = p;
+
 	/* suppress uevent if the disk supresses it */
 	if (!disk->dev.uevent_suppress)
 		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
 
 	return 0;
 
-out2:
-	device_del(&p->dev);
-out1:
-	put_device(&p->dev);
-	free_part_stats(p);
-out0:
+out_free:
 	kfree(p);
 	return err;
+out_del:
+	kobject_put(p->holder_dir);
+	device_del(&p->dev);
+out_put:
+	put_device(&p->dev);
+	return err;
 }
 
 /* Not exported, helper to add_disk(). */

From 310a2c1012934f590192377f65940cad4aa72b15 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:47:17 +0900
Subject: [PATCH 033/132] block: misc updates

This patch makes the following misc updates in preparation for
disk->part dereference fix and extended block devt support.

* implement part_to_disk()

* fix comment about gendisk->part indexing

* rename get_part() to disk_map_sector()

* don't use n which is always zero while printing disk information in
  diskstats_show()

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c           |  7 ++++---
 block/blk-merge.c          |  4 ++--
 block/genhd.c              |  4 ++--
 drivers/block/aoe/aoecmd.c |  2 +-
 include/linux/genhd.h      | 13 ++++++++++---
 5 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 86d22e7d65c5..a0dc2e72fcbb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -60,7 +60,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 	if (!blk_fs_request(rq) || !rq->rq_disk)
 		return;
 
-	part = get_part(rq->rq_disk, rq->sector);
+	part = disk_map_sector(rq->rq_disk, rq->sector);
 	if (!new_io)
 		__all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector);
 	else {
@@ -1557,7 +1557,8 @@ static int __end_that_request_first(struct request *req, int error,
 	}
 
 	if (blk_fs_request(req) && req->rq_disk) {
-		struct hd_struct *part = get_part(req->rq_disk, req->sector);
+		struct hd_struct *part =
+			disk_map_sector(req->rq_disk, req->sector);
 		const int rw = rq_data_dir(req);
 
 		all_stat_add(req->rq_disk, part, sectors[rw],
@@ -1745,7 +1746,7 @@ static void end_that_request_last(struct request *req, int error)
 	if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
-		struct hd_struct *part = get_part(disk, req->sector);
+		struct hd_struct *part = disk_map_sector(disk, req->sector);
 
 		__all_stat_inc(disk, part, ios[rw], req->sector);
 		__all_stat_add(disk, part, ticks[rw], duration, req->sector);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d81d91419ff5..9b17da698d7c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -387,8 +387,8 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	elv_merge_requests(q, req, next);
 
 	if (req->rq_disk) {
-		struct hd_struct *part
-			= get_part(req->rq_disk, req->sector);
+		struct hd_struct *part =
+			disk_map_sector(req->rq_disk, req->sector);
 		disk_round_stats(req->rq_disk);
 		req->rq_disk->in_flight--;
 		if (part) {
diff --git a/block/genhd.c b/block/genhd.c
index 8b9a9ff1a842..11038fbc75ed 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -568,7 +568,7 @@ static int diskstats_show(struct seq_file *s, void *v)
 {
 	struct gendisk *gp = v;
 	char buf[BDEVNAME_SIZE];
-	int n = 0;
+	int n;
 
 	/*
 	if (&gp->dev.kobj.entry == block_class.devices.next)
@@ -582,7 +582,7 @@ static int diskstats_show(struct seq_file *s, void *v)
 	disk_round_stats(gp);
 	preempt_enable();
 	seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
-		gp->major, n + gp->first_minor, disk_name(gp, n, buf),
+		gp->major, gp->first_minor, disk_name(gp, 0, buf),
 		disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
 		(unsigned long long)disk_stat_read(gp, sectors[0]),
 		jiffies_to_msecs(disk_stat_read(gp, ticks[0])),
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 2f1746295d06..885d1409521f 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -757,7 +757,7 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
 	const int rw = bio_data_dir(bio);
 	struct hd_struct *part;
 
-	part = get_part(disk, sector);
+	part = disk_map_sector(disk, sector);
 	all_stat_inc(disk, part, ios[rw], sector);
 	all_stat_add(disk, part, ticks[rw], duration, sector);
 	all_stat_add(disk, part, sectors[rw], n_sect, sector);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index be4f5e5bfe06..c64e659c9843 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -116,7 +116,7 @@ struct gendisk {
 	int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
 	char disk_name[32];		/* name of major driver */
-	struct hd_struct **part;	/* [indexed by minor] */
+	struct hd_struct **part;	/* [indexed by minor - 1] */
 	struct block_device_operations *fops;
 	struct request_queue *queue;
 	void *private_data;
@@ -145,14 +145,21 @@ struct gendisk {
 #endif
 };
 
+static inline struct gendisk *part_to_disk(struct hd_struct *part)
+{
+	if (likely(part))
+		return dev_to_disk((part)->dev.parent);
+	return NULL;
+}
+
 /* 
  * Macros to operate on percpu disk statistics:
  *
  * The __ variants should only be called in critical sections. The full
  * variants disable/enable preemption.
  */
-static inline struct hd_struct *get_part(struct gendisk *gendiskp,
-					 sector_t sector)
+static inline struct hd_struct *disk_map_sector(struct gendisk *gendiskp,
+						sector_t sector)
 {
 	struct hd_struct *part;
 	int i;

From cf771cb5a7b716f3f9e532fd42a1e3a0a75adec5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 09:01:09 +0200
Subject: [PATCH 034/132] block: make variable and argument names more
 consistent

In hd_struct, @partno is used to denote partition number and a number
of other places use @part to denote hd_struct.  Functions use @part
and @index instead.  This causes confusion and makes it difficult to
use consistent variable names for hd_struct.  Always use @partno if a
variable represents partition number.

Also, print out functions use @f or @part for seq_file argument.  Use
@seqf uniformly instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 54 ++++++++++++++++++++-----------------------
 block/ioctl.c         | 15 ++++++------
 fs/block_dev.c        |  8 +++----
 fs/partitions/check.c | 33 +++++++++++++-------------
 include/linux/genhd.h | 12 +++++-----
 5 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 11038fbc75ed..dc9ad4c171e2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -43,14 +43,14 @@ static inline int major_to_index(int major)
 }
 
 #ifdef CONFIG_PROC_FS
-void blkdev_show(struct seq_file *f, off_t offset)
+void blkdev_show(struct seq_file *seqf, off_t offset)
 {
 	struct blk_major_name *dp;
 
 	if (offset < BLKDEV_MAJOR_HASH_SIZE) {
 		mutex_lock(&block_class_lock);
 		for (dp = major_names[offset]; dp; dp = dp->next)
-			seq_printf(f, "%3d %s\n", dp->major, dp->name);
+			seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
 		mutex_unlock(&block_class_lock);
 	}
 }
@@ -157,7 +157,7 @@ void blk_unregister_region(dev_t devt, unsigned long range)
 
 EXPORT_SYMBOL(blk_unregister_region);
 
-static struct kobject *exact_match(dev_t devt, int *part, void *data)
+static struct kobject *exact_match(dev_t devt, int *partno, void *data)
 {
 	struct gendisk *p = data;
 
@@ -217,9 +217,9 @@ void unlink_gendisk(struct gendisk *disk)
  * This function gets the structure containing partitioning
  * information for the given device @devt.
  */
-struct gendisk *get_gendisk(dev_t devt, int *part)
+struct gendisk *get_gendisk(dev_t devt, int *partno)
 {
-	struct kobject *kobj = kobj_lookup(bdev_map, devt, part);
+	struct kobject *kobj = kobj_lookup(bdev_map, devt, partno);
 	struct device *dev = kobj_to_dev(kobj);
 
 	return  kobj ? dev_to_disk(dev) : NULL;
@@ -336,23 +336,12 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 	return p;
 }
 
-static int show_partition(struct seq_file *part, void *v)
+static int show_partition(struct seq_file *seqf, void *v)
 {
 	struct gendisk *sgp = v;
 	int n;
 	char buf[BDEVNAME_SIZE];
 
-	/*
-	 * Print header if start told us to do.  This is to preserve
-	 * the original behavior of not printing header if no
-	 * partition exists.  This hackery will be removed later with
-	 * class iteration clean up.
-	 */
-	if (part->private) {
-		seq_puts(part, "major minor  #blocks  name\n\n");
-		part->private = NULL;
-	}
-
 	/* Don't show non-partitionable removeable devices or empty devices */
 	if (!get_capacity(sgp) ||
 			(sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE)))
@@ -361,7 +350,7 @@ static int show_partition(struct seq_file *part, void *v)
 		return 0;
 
 	/* show the full disk and all non-0 size partitions of it */
-	seq_printf(part, "%4d  %4d %10llu %s\n",
+	seq_printf(seqf, "%4d  %4d %10llu %s\n",
 		sgp->major, sgp->first_minor,
 		(unsigned long long)get_capacity(sgp) >> 1,
 		disk_name(sgp, 0, buf));
@@ -370,7 +359,7 @@ static int show_partition(struct seq_file *part, void *v)
 			continue;
 		if (sgp->part[n]->nr_sects == 0)
 			continue;
-		seq_printf(part, "%4d  %4d %10llu %s\n",
+		seq_printf(seqf, "%4d  %4d %10llu %s\n",
 			sgp->major, n + 1 + sgp->first_minor,
 			(unsigned long long)sgp->part[n]->nr_sects >> 1 ,
 			disk_name(sgp, n + 1, buf));
@@ -388,7 +377,7 @@ const struct seq_operations partitions_op = {
 #endif
 
 
-static struct kobject *base_probe(dev_t devt, int *part, void *data)
+static struct kobject *base_probe(dev_t devt, int *partno, void *data)
 {
 	if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
 		/* Make old-style 2.4 aliases work */
@@ -564,7 +553,14 @@ static struct device_type disk_type = {
 };
 
 #ifdef CONFIG_PROC_FS
-static int diskstats_show(struct seq_file *s, void *v)
+/*
+ * aggregate disk stat collector.  Uses the same stats that the sysfs
+ * entries do, above, but makes them available through one seq_file.
+ *
+ * The output looks suspiciously like /proc/partitions with a bunch of
+ * extra fields.
+ */
+static int diskstats_show(struct seq_file *seqf, void *v)
 {
 	struct gendisk *gp = v;
 	char buf[BDEVNAME_SIZE];
@@ -572,7 +568,7 @@ static int diskstats_show(struct seq_file *s, void *v)
 
 	/*
 	if (&gp->dev.kobj.entry == block_class.devices.next)
-		seq_puts(s,	"major minor name"
+		seq_puts(seqf,	"major minor name"
 				"     rio rmerge rsect ruse wio wmerge "
 				"wsect wuse running use aveq"
 				"\n\n");
@@ -581,7 +577,7 @@ static int diskstats_show(struct seq_file *s, void *v)
 	preempt_disable();
 	disk_round_stats(gp);
 	preempt_enable();
-	seq_printf(s, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
+	seq_printf(seqf, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
 		gp->major, gp->first_minor, disk_name(gp, 0, buf),
 		disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
 		(unsigned long long)disk_stat_read(gp, sectors[0]),
@@ -603,7 +599,7 @@ static int diskstats_show(struct seq_file *s, void *v)
 		preempt_disable();
 		part_round_stats(hd);
 		preempt_enable();
-		seq_printf(s, "%4d %4d %s %lu %lu %llu "
+		seq_printf(seqf, "%4d %4d %s %lu %lu %llu "
 			   "%u %lu %lu %llu %u %u %u %u\n",
 			   gp->major, n + gp->first_minor + 1,
 			   disk_name(gp, n + 1, buf),
@@ -655,7 +651,7 @@ void genhd_media_change_notify(struct gendisk *disk)
 EXPORT_SYMBOL_GPL(genhd_media_change_notify);
 #endif  /*  0  */
 
-dev_t blk_lookup_devt(const char *name, int part)
+dev_t blk_lookup_devt(const char *name, int partno)
 {
 	dev_t devt = MKDEV(0, 0);
 	struct class_dev_iter iter;
@@ -665,9 +661,9 @@ dev_t blk_lookup_devt(const char *name, int part)
 	while ((dev = class_dev_iter_next(&iter))) {
 		struct gendisk *disk = dev_to_disk(dev);
 
-		if (!strcmp(dev->bus_id, name) && part < disk->minors) {
+		if (!strcmp(dev->bus_id, name) && partno < disk->minors) {
 			devt = MKDEV(MAJOR(dev->devt),
-				     MINOR(dev->devt) + part);
+				     MINOR(dev->devt) + partno);
 			break;
 		}
 	}
@@ -777,10 +773,10 @@ int bdev_read_only(struct block_device *bdev)
 
 EXPORT_SYMBOL(bdev_read_only);
 
-int invalidate_partition(struct gendisk *disk, int index)
+int invalidate_partition(struct gendisk *disk, int partno)
 {
 	int res = 0;
-	struct block_device *bdev = bdget_disk(disk, index);
+	struct block_device *bdev = bdget_disk(disk, partno);
 	if (bdev) {
 		fsync_bdev(bdev);
 		res = __invalidate_device(bdev);
diff --git a/block/ioctl.c b/block/ioctl.c
index eb046aeede8a..d77f5e280a6e 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -15,7 +15,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	struct blkpg_ioctl_arg a;
 	struct blkpg_partition p;
 	long long start, length;
-	int part;
+	int partno;
 	int i;
 	int err;
 
@@ -28,8 +28,8 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	disk = bdev->bd_disk;
 	if (bdev != bdev->bd_contains)
 		return -EINVAL;
-	part = p.pno;
-	if (part <= 0 || part >= disk->minors)
+	partno = p.pno;
+	if (partno <= 0 || partno >= disk->minors)
 		return -EINVAL;
 	switch (a.op) {
 		case BLKPG_ADD_PARTITION:
@@ -59,13 +59,14 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 				}
 			}
 			/* all seems OK */
-			err = add_partition(disk, part, start, length, ADDPART_FLAG_NONE);
+			err = add_partition(disk, partno, start, length,
+					    ADDPART_FLAG_NONE);
 			mutex_unlock(&bdev->bd_mutex);
 			return err;
 		case BLKPG_DEL_PARTITION:
-			if (!disk->part[part-1])
+			if (!disk->part[partno - 1])
 				return -ENXIO;
-			bdevp = bdget_disk(disk, part);
+			bdevp = bdget_disk(disk, partno);
 			if (!bdevp)
 				return -ENOMEM;
 			mutex_lock(&bdevp->bd_mutex);
@@ -79,7 +80,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			invalidate_bdev(bdevp);
 
 			mutex_lock_nested(&bdev->bd_mutex, 1);
-			delete_partition(disk, part);
+			delete_partition(disk, partno);
 			mutex_unlock(&bdev->bd_mutex);
 			mutex_unlock(&bdevp->bd_mutex);
 			bdput(bdevp);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index aff54219e049..de0776cd7215 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -930,7 +930,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	struct module *owner = NULL;
 	struct gendisk *disk;
 	int ret;
-	int part;
+	int partno;
 	int perm = 0;
 
 	if (file->f_mode & FMODE_READ)
@@ -949,7 +949,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	ret = -ENXIO;
 	file->f_mapping = bdev->bd_inode->i_mapping;
 	lock_kernel();
-	disk = get_gendisk(bdev->bd_dev, &part);
+	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk) {
 		unlock_kernel();
 		bdput(bdev);
@@ -961,7 +961,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
 		bdev->bd_contains = bdev;
-		if (!part) {
+		if (!partno) {
 			struct backing_dev_info *bdi;
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev->bd_inode, file);
@@ -989,7 +989,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (ret)
 				goto out_first;
 			bdev->bd_contains = whole;
-			p = disk->part[part - 1];
+			p = disk->part[partno - 1];
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
 			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 16f98d824608..b86aab1b0df6 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,22 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
  * a pointer to that same buffer (for convenience).
  */
 
-char *disk_name(struct gendisk *hd, int part, char *buf)
+char *disk_name(struct gendisk *hd, int partno, char *buf)
 {
-	if (!part)
+	if (!partno)
 		snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
 	else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
-		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part);
+		snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
 	else
-		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part);
+		snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
 
 	return buf;
 }
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
-	int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
-	return disk_name(bdev->bd_disk, part, buf);
+	int partno = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
+	return disk_name(bdev->bd_disk, partno, buf);
 }
 
 EXPORT_SYMBOL(bdevname);
@@ -310,13 +310,13 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 	kobject_put(k);
 }
 
-void delete_partition(struct gendisk *disk, int part)
+void delete_partition(struct gendisk *disk, int partno)
 {
-	struct hd_struct *p = disk->part[part-1];
+	struct hd_struct *p = disk->part[partno - 1];
 
 	if (!p)
 		return;
-	disk->part[part-1] = NULL;
+	disk->part[partno - 1] = NULL;
 	p->start_sect = 0;
 	p->nr_sects = 0;
 	part_stat_set_all(p, 0);
@@ -333,12 +333,13 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
-int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
+int add_partition(struct gendisk *disk, int partno,
+		  sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
 	int err;
 
-	if (disk->part[part - 1])
+	if (disk->part[partno - 1])
 		return -EBUSY;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -351,18 +352,18 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	}
 	p->start_sect = start;
 	p->nr_sects = len;
-	p->partno = part;
+	p->partno = partno;
 	p->policy = disk->policy;
 
 	if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
 		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-		"%sp%d", disk->dev.bus_id, part);
+		"%sp%d", disk->dev.bus_id, partno);
 	else
 		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-			 "%s%d", disk->dev.bus_id, part);
+			 "%s%d", disk->dev.bus_id, partno);
 
 	device_initialize(&p->dev);
-	p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
+	p->dev.devt = MKDEV(disk->major, disk->first_minor + partno);
 	p->dev.class = &block_class;
 	p->dev.type = &part_type;
 	p->dev.parent = &disk->dev;
@@ -386,7 +387,7 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	}
 
 	/* everything is up and running, commence */
-	disk->part[part - 1] = p;
+	disk->part[partno - 1] = p;
 
 	/* suppress uevent if the disk supresses it */
 	if (!disk->dev.uevent_suppress)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c64e659c9843..d1723c0a8600 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -365,7 +365,7 @@ extern int get_blkdev_list(char *, int);
 extern void add_disk(struct gendisk *disk);
 extern void del_gendisk(struct gendisk *gp);
 extern void unlink_gendisk(struct gendisk *gp);
-extern struct gendisk *get_gendisk(dev_t dev, int *part);
+extern struct gendisk *get_gendisk(dev_t dev, int *partno);
 
 extern void set_device_ro(struct block_device *bdev, int flag);
 extern void set_disk_ro(struct gendisk *disk, int flag);
@@ -534,8 +534,8 @@ struct unixware_disklabel {
 #define ADDPART_FLAG_RAID	1
 #define ADDPART_FLAG_WHOLEDISK	2
 
-extern dev_t blk_lookup_devt(const char *name, int part);
-extern char *disk_name (struct gendisk *hd, int part, char *buf);
+extern dev_t blk_lookup_devt(const char *name, int partno);
+extern char *disk_name (struct gendisk *hd, int partno, char *buf);
 
 extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
 extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int);
@@ -553,16 +553,16 @@ extern void blk_register_region(dev_t devt, unsigned long range,
 			void *data);
 extern void blk_unregister_region(dev_t devt, unsigned long range);
 
-static inline struct block_device *bdget_disk(struct gendisk *disk, int index)
+static inline struct block_device *bdget_disk(struct gendisk *disk, int partno)
 {
-	return bdget(MKDEV(disk->major, disk->first_minor) + index);
+	return bdget(MKDEV(disk->major, disk->first_minor) + partno);
 }
 
 #else /* CONFIG_BLOCK */
 
 static inline void printk_all_partitions(void) { }
 
-static inline dev_t blk_lookup_devt(const char *name, int part)
+static inline dev_t blk_lookup_devt(const char *name, int partno)
 {
 	dev_t devt = MKDEV(0, 0);
 	return devt;

From f331c0296f2a9fee0d396a70598b954062603015 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 09:01:48 +0200
Subject: [PATCH 035/132] block: don't depend on consecutive minor space

* Implement disk_devt() and part_devt() and use them to directly
  access devt instead of computing it from ->major and ->first_minor.

  Note that all references to ->major and ->first_minor outside of
  block layer are used to determine devt of the disk (the part0) and as
  ->major and ->first_minor will continue to represent devt for the
  disk, converting these users isn't strictly necessary.  However,
  convert them for consistency.

* Implement disk_max_parts() to avoid directly dereferencing
  genhd->minors.

* Update bdget_disk() such that it doesn't assume consecutive minor
  space.

* Move devt computation from register_disk() to add_disk() and make it
  the only one (all other usages use the initially determined value).

These changes clean up the code and will help with the disk->part
dereference fix and extended block device numbers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c                       | 107 ++++++++++++++++++++--------
 block/ioctl.c                       |   6 +-
 drivers/block/pktcdvd.c             |   2 +-
 drivers/block/ps3disk.c             |   2 +-
 drivers/char/random.c               |   6 +-
 drivers/md/dm-ioctl.c               |   4 +-
 drivers/md/dm-stripe.c              |   4 +-
 drivers/md/dm.c                     |   7 +-
 drivers/memstick/core/mspro_block.c |   2 +-
 drivers/mmc/card/block.c            |   2 +-
 drivers/s390/block/dasd_proc.c      |   3 +-
 drivers/s390/block/dcssblk.c        |   4 +-
 drivers/scsi/sr.c                   |   2 +-
 fs/block_dev.c                      |   2 +-
 fs/partitions/check.c               |  19 ++---
 include/linux/genhd.h               |  27 +++++--
 16 files changed, 132 insertions(+), 67 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index dc9ad4c171e2..fa32d09fda24 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -186,13 +186,14 @@ void add_disk(struct gendisk *disk)
 	int retval;
 
 	disk->flags |= GENHD_FL_UP;
-	blk_register_region(MKDEV(disk->major, disk->first_minor),
-			    disk->minors, NULL, exact_match, exact_lock, disk);
+	disk->dev.devt = MKDEV(disk->major, disk->first_minor);
+	blk_register_region(disk_devt(disk), disk->minors, NULL,
+			    exact_match, exact_lock, disk);
 	register_disk(disk);
 	blk_register_queue(disk);
 
 	bdi = &disk->queue->backing_dev_info;
-	bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor));
+	bdi_register_dev(bdi, disk_devt(disk));
 	retval = sysfs_create_link(&disk->dev.kobj, &bdi->dev->kobj, "bdi");
 	WARN_ON(retval);
 }
@@ -205,8 +206,7 @@ void unlink_gendisk(struct gendisk *disk)
 	sysfs_remove_link(&disk->dev.kobj, "bdi");
 	bdi_unregister(&disk->queue->backing_dev_info);
 	blk_unregister_queue(disk);
-	blk_unregister_region(MKDEV(disk->major, disk->first_minor),
-			      disk->minors);
+	blk_unregister_region(disk_devt(disk), disk->minors);
 }
 
 /**
@@ -225,6 +225,38 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 	return  kobj ? dev_to_disk(dev) : NULL;
 }
 
+/**
+ * bdget_disk - do bdget() by gendisk and partition number
+ * @disk: gendisk of interest
+ * @partno: partition number
+ *
+ * Find partition @partno from @disk, do bdget() on it.
+ *
+ * CONTEXT:
+ * Don't care.
+ *
+ * RETURNS:
+ * Resulting block_device on success, NULL on failure.
+ */
+extern struct block_device *bdget_disk(struct gendisk *disk, int partno)
+{
+	dev_t devt = MKDEV(0, 0);
+
+	if (partno == 0)
+		devt = disk_devt(disk);
+	else {
+		struct hd_struct *part = disk->part[partno - 1];
+
+		if (part && part->nr_sects)
+			devt = part_devt(part);
+	}
+
+	if (likely(devt != MKDEV(0, 0)))
+		return bdget(devt);
+	return NULL;
+}
+EXPORT_SYMBOL(bdget_disk);
+
 /*
  * print a full list of all partitions - intended for places where the root
  * filesystem can't be mounted and thus to give the victim some idea of what
@@ -255,7 +287,7 @@ void __init printk_all_partitions(void)
 		 * option takes.
 		 */
 		printk("%02x%02x %10llu %s",
-		       disk->major, disk->first_minor,
+		       MAJOR(disk_devt(disk)), MINOR(disk_devt(disk)),
 		       (unsigned long long)get_capacity(disk) >> 1,
 		       disk_name(disk, 0, buf));
 		if (disk->driverfs_dev != NULL &&
@@ -266,15 +298,15 @@ void __init printk_all_partitions(void)
 			printk(" (driver?)\n");
 
 		/* now show the partitions */
-		for (n = 0; n < disk->minors - 1; ++n) {
-			if (disk->part[n] == NULL)
-				continue;
-			if (disk->part[n]->nr_sects == 0)
+		for (n = 0; n < disk_max_parts(disk); ++n) {
+			struct hd_struct *part = disk->part[n];
+
+			if (!part || !part->nr_sects)
 				continue;
 			printk("  %02x%02x %10llu %s\n",
-			       disk->major, n + 1 + disk->first_minor,
-			       (unsigned long long)disk->part[n]->nr_sects >> 1,
-			       disk_name(disk, n + 1, buf));
+			       MAJOR(part_devt(part)), MINOR(part_devt(part)),
+			       (unsigned long long)part->nr_sects >> 1,
+			       disk_name(disk, part->partno, buf));
 		}
 	}
 	class_dev_iter_exit(&iter);
@@ -343,26 +375,27 @@ static int show_partition(struct seq_file *seqf, void *v)
 	char buf[BDEVNAME_SIZE];
 
 	/* Don't show non-partitionable removeable devices or empty devices */
-	if (!get_capacity(sgp) ||
-			(sgp->minors == 1 && (sgp->flags & GENHD_FL_REMOVABLE)))
+	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+				   (sgp->flags & GENHD_FL_REMOVABLE)))
 		return 0;
 	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
 		return 0;
 
 	/* show the full disk and all non-0 size partitions of it */
 	seq_printf(seqf, "%4d  %4d %10llu %s\n",
-		sgp->major, sgp->first_minor,
+		MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)),
 		(unsigned long long)get_capacity(sgp) >> 1,
 		disk_name(sgp, 0, buf));
-	for (n = 0; n < sgp->minors - 1; n++) {
-		if (!sgp->part[n])
+	for (n = 0; n < disk_max_parts(sgp); n++) {
+		struct hd_struct *part = sgp->part[n];
+		if (!part)
 			continue;
-		if (sgp->part[n]->nr_sects == 0)
+		if (part->nr_sects == 0)
 			continue;
 		seq_printf(seqf, "%4d  %4d %10llu %s\n",
-			sgp->major, n + 1 + sgp->first_minor,
-			(unsigned long long)sgp->part[n]->nr_sects >> 1 ,
-			disk_name(sgp, n + 1, buf));
+			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
+			   (unsigned long long)part->nr_sects >> 1,
+			   disk_name(sgp, part->partno, buf));
 	}
 
 	return 0;
@@ -578,7 +611,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	disk_round_stats(gp);
 	preempt_enable();
 	seq_printf(seqf, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
-		gp->major, gp->first_minor, disk_name(gp, 0, buf),
+		MAJOR(disk_devt(gp)), MINOR(disk_devt(gp)),
+		disk_name(gp, 0, buf),
 		disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
 		(unsigned long long)disk_stat_read(gp, sectors[0]),
 		jiffies_to_msecs(disk_stat_read(gp, ticks[0])),
@@ -590,7 +624,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
 
 	/* now show all non-0 size partitions of it */
-	for (n = 0; n < gp->minors - 1; n++) {
+	for (n = 0; n < disk_max_parts(gp); n++) {
 		struct hd_struct *hd = gp->part[n];
 
 		if (!hd || !hd->nr_sects)
@@ -601,8 +635,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		preempt_enable();
 		seq_printf(seqf, "%4d %4d %s %lu %lu %llu "
 			   "%u %lu %lu %llu %u %u %u %u\n",
-			   gp->major, n + gp->first_minor + 1,
-			   disk_name(gp, n + 1, buf),
+			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
+			   disk_name(gp, hd->partno, buf),
 			   part_stat_read(hd, ios[0]),
 			   part_stat_read(hd, merges[0]),
 			   (unsigned long long)part_stat_read(hd, sectors[0]),
@@ -661,11 +695,22 @@ dev_t blk_lookup_devt(const char *name, int partno)
 	while ((dev = class_dev_iter_next(&iter))) {
 		struct gendisk *disk = dev_to_disk(dev);
 
-		if (!strcmp(dev->bus_id, name) && partno < disk->minors) {
-			devt = MKDEV(MAJOR(dev->devt),
-				     MINOR(dev->devt) + partno);
-			break;
+		if (strcmp(dev->bus_id, name))
+			continue;
+		if (partno < 0 || partno > disk_max_parts(disk))
+			continue;
+
+		if (partno == 0)
+			devt = disk_devt(disk);
+		else {
+			struct hd_struct *part = disk->part[partno - 1];
+
+			if (!part || !part->nr_sects)
+				continue;
+
+			devt = part_devt(part);
 		}
+		break;
 	}
 	class_dev_iter_exit(&iter);
 	return devt;
@@ -755,7 +800,7 @@ void set_disk_ro(struct gendisk *disk, int flag)
 {
 	int i;
 	disk->policy = flag;
-	for (i = 0; i < disk->minors - 1; i++)
+	for (i = 0; i < disk_max_parts(disk); i++)
 		if (disk->part[i]) disk->part[i]->policy = flag;
 }
 
diff --git a/block/ioctl.c b/block/ioctl.c
index d77f5e280a6e..403f7d7e0c28 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -29,7 +29,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	if (bdev != bdev->bd_contains)
 		return -EINVAL;
 	partno = p.pno;
-	if (partno <= 0 || partno >= disk->minors)
+	if (partno <= 0 || partno > disk_max_parts(disk))
 		return -EINVAL;
 	switch (a.op) {
 		case BLKPG_ADD_PARTITION:
@@ -47,7 +47,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			mutex_lock(&bdev->bd_mutex);
 
 			/* overlap? */
-			for (i = 0; i < disk->minors - 1; i++) {
+			for (i = 0; i < disk_max_parts(disk); i++) {
 				struct hd_struct *s = disk->part[i];
 
 				if (!s)
@@ -96,7 +96,7 @@ static int blkdev_reread_part(struct block_device *bdev)
 	struct gendisk *disk = bdev->bd_disk;
 	int res;
 
-	if (disk->minors == 1 || bdev != bdev->bd_contains)
+	if (!disk_max_parts(disk) || bdev != bdev->bd_contains)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 29b7a648cc6e..e1a90bbb4747 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2911,7 +2911,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
 	if (!disk->queue)
 		goto out_mem2;
 
-	pd->pkt_dev = MKDEV(disk->major, disk->first_minor);
+	pd->pkt_dev = MKDEV(pktdev_major, idx);
 	ret = pkt_new_dev(pd, dev);
 	if (ret)
 		goto out_new_dev;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 4b0d6c7f4c66..936466f62afd 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -541,7 +541,7 @@ static int ps3disk_remove(struct ps3_system_bus_device *_dev)
 	struct ps3disk_private *priv = dev->sbd.core.driver_data;
 
 	mutex_lock(&ps3disk_mask_mutex);
-	__clear_bit(priv->gendisk->first_minor / PS3DISK_MINORS,
+	__clear_bit(MINOR(disk_devt(priv->gendisk)) / PS3DISK_MINORS,
 		    &ps3disk_mask);
 	mutex_unlock(&ps3disk_mask_mutex);
 	del_gendisk(priv->gendisk);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 7ce1ac4baa6d..6af435b89867 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -661,10 +661,10 @@ void add_disk_randomness(struct gendisk *disk)
 	if (!disk || !disk->random)
 		return;
 	/* first major is 1, so we get >= 0x200 here */
-	DEBUG_ENT("disk event %d:%d\n", disk->major, disk->first_minor);
+	DEBUG_ENT("disk event %d:%d\n",
+		  MAJOR(disk_devt(disk)), MINOR(disk_devt(disk)));
 
-	add_timer_randomness(disk->random,
-			     0x100 + MKDEV(disk->major, disk->first_minor));
+	add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
 }
 #endif
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index b262c0042de3..c3de311117a1 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -426,7 +426,7 @@ static int list_devices(struct dm_ioctl *param, size_t param_size)
 				old_nl->next = (uint32_t) ((void *) nl -
 							   (void *) old_nl);
 			disk = dm_disk(hc->md);
-			nl->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
+			nl->dev = huge_encode_dev(disk_devt(disk));
 			nl->next = 0;
 			strcpy(nl->name, hc->name);
 
@@ -539,7 +539,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	if (dm_suspended(md))
 		param->flags |= DM_SUSPEND_FLAG;
 
-	param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
+	param->dev = huge_encode_dev(disk_devt(disk));
 
 	/*
 	 * Yes, this will be out of date by the time it gets back
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 4de90ab3968b..b745d8ac625b 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -284,8 +284,8 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 
 	memset(major_minor, 0, sizeof(major_minor));
 	sprintf(major_minor, "%d:%d",
-		bio->bi_bdev->bd_disk->major,
-		bio->bi_bdev->bd_disk->first_minor);
+		MAJOR(disk_devt(bio->bi_bdev->bd_disk)),
+		MINOR(disk_devt(bio->bi_bdev->bd_disk)));
 
 	/*
 	 * Test to see which stripe drive triggered the event
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ace998ce59f6..a78caad29996 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1146,7 +1146,7 @@ static void unlock_fs(struct mapped_device *md);
 
 static void free_dev(struct mapped_device *md)
 {
-	int minor = md->disk->first_minor;
+	int minor = MINOR(disk_devt(md->disk));
 
 	if (md->suspended_bdev) {
 		unlock_fs(md);
@@ -1267,7 +1267,7 @@ static struct mapped_device *dm_find_md(dev_t dev)
 
 	md = idr_find(&_minor_idr, minor);
 	if (md && (md == MINOR_ALLOCED ||
-		   (dm_disk(md)->first_minor != minor) ||
+		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
 		   test_bit(DMF_FREEING, &md->flags))) {
 		md = NULL;
 		goto out;
@@ -1318,7 +1318,8 @@ void dm_put(struct mapped_device *md)
 
 	if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
 		map = dm_get_table(md);
-		idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
+		idr_replace(&_minor_idr, MINOR_ALLOCED,
+			    MINOR(disk_devt(dm_disk(md))));
 		set_bit(DMF_FREEING, &md->flags);
 		spin_unlock(&_minor_lock);
 		if (!dm_suspended(md)) {
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index d2d2318dafa4..82bf649ef138 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -197,7 +197,7 @@ static int mspro_block_bd_open(struct inode *inode, struct file *filp)
 static int mspro_block_disk_release(struct gendisk *disk)
 {
 	struct mspro_block_data *msb = disk->private_data;
-	int disk_id = disk->first_minor >> MSPRO_BLOCK_PART_SHIFT;
+	int disk_id = MINOR(disk_devt(disk)) >> MSPRO_BLOCK_PART_SHIFT;
 
 	mutex_lock(&mspro_block_disk_lock);
 
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index ebc8b9d77613..97156b689e82 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -83,7 +83,7 @@ static void mmc_blk_put(struct mmc_blk_data *md)
 	mutex_lock(&open_lock);
 	md->usage--;
 	if (md->usage == 0) {
-		int devidx = md->disk->first_minor >> MMC_SHIFT;
+		int devidx = MINOR(disk_devt(md->disk)) >> MMC_SHIFT;
 		__clear_bit(devidx, dev_use);
 
 		put_disk(md->disk);
diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c
index 03c0e40a92ff..e3b5c4d3036e 100644
--- a/drivers/s390/block/dasd_proc.c
+++ b/drivers/s390/block/dasd_proc.c
@@ -76,7 +76,8 @@ dasd_devices_show(struct seq_file *m, void *v)
 	/* Print kdev. */
 	if (block->gdp)
 		seq_printf(m, " at (%3d:%6d)",
-			   block->gdp->major, block->gdp->first_minor);
+			   MAJOR(disk_devt(block->gdp)),
+			   MINOR(disk_devt(block->gdp)));
 	else
 		seq_printf(m, "  at (???:??????)");
 	/* Print device name. */
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 711b3004b3e6..9481e4a3f76e 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -114,7 +114,7 @@ dcssblk_assign_free_minor(struct dcssblk_dev_info *dev_info)
 		found = 0;
 		// test if minor available
 		list_for_each_entry(entry, &dcssblk_devices, lh)
-			if (minor == entry->gd->first_minor)
+			if (minor == MINOR(disk_devt(entry->gd)))
 				found++;
 		if (!found) break; // got unused minor
 	}
@@ -397,7 +397,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 		goto unload_seg;
 	}
 	sprintf(dev_info->gd->disk_name, "dcssblk%d",
-		dev_info->gd->first_minor);
+		MINOR(disk_devt(dev_info->gd)));
 	list_add_tail(&dev_info->lh, &dcssblk_devices);
 
 	if (!try_module_get(THIS_MODULE)) {
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 27f5bfd1def3..8dbe3798d5fd 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -878,7 +878,7 @@ static void sr_kref_release(struct kref *kref)
 	struct gendisk *disk = cd->disk;
 
 	spin_lock(&sr_index_lock);
-	clear_bit(disk->first_minor, sr_index_bits);
+	clear_bit(MINOR(disk_devt(disk)), sr_index_bits);
 	spin_unlock(&sr_index_lock);
 
 	unregister_cdrom(&cd->cdi);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index de0776cd7215..72e0a2887cb7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -892,7 +892,7 @@ int check_disk_change(struct block_device *bdev)
 
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
-	if (bdev->bd_disk->minors > 1)
+	if (disk_max_parts(bdev->bd_disk))
 		bdev->bd_invalidated = 1;
 	return 1;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index b86aab1b0df6..e77fa144a07d 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -134,7 +134,11 @@ char *disk_name(struct gendisk *hd, int partno, char *buf)
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
-	int partno = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
+	int partno = 0;
+
+	if (bdev->bd_part)
+		partno = bdev->bd_part->partno;
+
 	return disk_name(bdev->bd_disk, partno, buf);
 }
 
@@ -169,7 +173,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
 
-	state->limit = hd->minors;
+	state->limit = disk_max_parts(hd) + 1;
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
@@ -416,7 +420,6 @@ void register_disk(struct gendisk *disk)
 	int err;
 
 	disk->dev.parent = disk->driverfs_dev;
-	disk->dev.devt = MKDEV(disk->major, disk->first_minor);
 
 	strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE);
 	/* ewww... some of these buggers have / in the name... */
@@ -440,7 +443,7 @@ void register_disk(struct gendisk *disk)
 	disk_sysfs_add_subdirs(disk);
 
 	/* No minors to use for partitions */
-	if (disk->minors == 1)
+	if (!disk_max_parts(disk))
 		goto exit;
 
 	/* No such device (e.g., media were just removed) */
@@ -463,8 +466,8 @@ exit:
 	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
-	for (i = 1; i < disk->minors; i++) {
-		p = disk->part[i-1];
+	for (i = 0; i < disk_max_parts(disk); i++) {
+		p = disk->part[i];
 		if (!p || !p->nr_sects)
 			continue;
 		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
@@ -482,7 +485,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	if (res)
 		return res;
 	bdev->bd_invalidated = 0;
-	for (p = 1; p < disk->minors; p++)
+	for (p = 1; p <= disk_max_parts(disk); p++)
 		delete_partition(disk, p);
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
@@ -545,7 +548,7 @@ void del_gendisk(struct gendisk *disk)
 	int p;
 
 	/* invalidate stuff */
-	for (p = disk->minors - 1; p > 0; p--) {
+	for (p = disk_max_parts(disk); p > 0; p--) {
 		invalidate_partition(disk, p);
 		delete_partition(disk, p);
 	}
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index d1723c0a8600..0ff75329199c 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -111,10 +111,14 @@ struct hd_struct {
 #define GENHD_FL_FAIL				64
 
 struct gendisk {
+	/* major, first_minor and minors are input parameters only,
+	 * don't use directly.  Use disk_devt() and disk_max_parts().
+	 */
 	int major;			/* major number of driver */
 	int first_minor;
 	int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
+
 	char disk_name[32];		/* name of major driver */
 	struct hd_struct **part;	/* [indexed by minor - 1] */
 	struct block_device_operations *fops;
@@ -152,6 +156,21 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
 	return NULL;
 }
 
+static inline int disk_max_parts(struct gendisk *disk)
+{
+	return disk->minors - 1;
+}
+
+static inline dev_t disk_devt(struct gendisk *disk)
+{
+	return disk->dev.devt;
+}
+
+static inline dev_t part_devt(struct hd_struct *part)
+{
+	return part->dev.devt;
+}
+
 /* 
  * Macros to operate on percpu disk statistics:
  *
@@ -163,7 +182,7 @@ static inline struct hd_struct *disk_map_sector(struct gendisk *gendiskp,
 {
 	struct hd_struct *part;
 	int i;
-	for (i = 0; i < gendiskp->minors - 1; i++) {
+	for (i = 0; i < disk_max_parts(gendiskp); i++) {
 		part = gendiskp->part[i];
 		if (part && part->start_sect <= sector
 		    && sector < part->start_sect + part->nr_sects)
@@ -366,6 +385,7 @@ extern void add_disk(struct gendisk *disk);
 extern void del_gendisk(struct gendisk *gp);
 extern void unlink_gendisk(struct gendisk *gp);
 extern struct gendisk *get_gendisk(dev_t dev, int *partno);
+extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
 
 extern void set_device_ro(struct block_device *bdev, int flag);
 extern void set_disk_ro(struct gendisk *disk, int flag);
@@ -553,11 +573,6 @@ extern void blk_register_region(dev_t devt, unsigned long range,
 			void *data);
 extern void blk_unregister_region(dev_t devt, unsigned long range);
 
-static inline struct block_device *bdget_disk(struct gendisk *disk, int partno)
-{
-	return bdget(MKDEV(disk->major, disk->first_minor) + partno);
-}
-
 #else /* CONFIG_BLOCK */
 
 static inline void printk_all_partitions(void) { }

From e71bf0d0ee89e51b92776391c5634938236977d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 09:03:02 +0200
Subject: [PATCH 036/132] block: fix disk->part[] dereferencing race

disk->part[] is protected by its matching bdev's lock.  However,
non-critical accesses like collecting stats and printing out sysfs and
proc information used to be performed without any locking.  As
partitions can come and go dynamically, partitions can go away
underneath those non-critical accesses.  As some of those accesses are
writes, this theoretically can lead to silent corruption.

This patch fixes the race by using RCU for the partition array and dev
reference counter to hold partitions.

* Rename disk->part[] to disk->__part[] to make sure no one outside
  genhd layer proper accesses it directly.

* Use RCU for disk->__part[] dereferencing.

* Implement disk_{get|put}_part() which can be used to get and put
  partitions from gendisk respectively.

* Iterators are implemented to help iterate through all partitions
  safely.

* Functions which require RCU readlock are marked with _rcu suffix.

* Use disk_put_part() in __blkdev_put() instead of directly putting
  the contained kobject.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c           |  20 +++-
 block/blk-merge.c          |   9 +-
 block/genhd.c              | 218 +++++++++++++++++++++++++++++++------
 block/ioctl.c              |  26 +++--
 drivers/block/aoe/aoecmd.c |   6 +-
 fs/block_dev.c             |  15 +--
 fs/partitions/check.c      |  70 +++++++-----
 include/linux/genhd.h      |  53 ++++++---
 8 files changed, 323 insertions(+), 94 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a0dc2e72fcbb..d6128d9ad601 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -60,7 +60,9 @@ static void drive_stat_acct(struct request *rq, int new_io)
 	if (!blk_fs_request(rq) || !rq->rq_disk)
 		return;
 
-	part = disk_map_sector(rq->rq_disk, rq->sector);
+	rcu_read_lock();
+
+	part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
 	if (!new_io)
 		__all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector);
 	else {
@@ -71,6 +73,8 @@ static void drive_stat_acct(struct request *rq, int new_io)
 			part->in_flight++;
 		}
 	}
+
+	rcu_read_unlock();
 }
 
 void blk_queue_congestion_threshold(struct request_queue *q)
@@ -1557,12 +1561,14 @@ static int __end_that_request_first(struct request *req, int error,
 	}
 
 	if (blk_fs_request(req) && req->rq_disk) {
-		struct hd_struct *part =
-			disk_map_sector(req->rq_disk, req->sector);
 		const int rw = rq_data_dir(req);
+		struct hd_struct *part;
 
+		rcu_read_lock();
+		part = disk_map_sector_rcu(req->rq_disk, req->sector);
 		all_stat_add(req->rq_disk, part, sectors[rw],
 				nr_bytes >> 9, req->sector);
+		rcu_read_unlock();
 	}
 
 	total_bytes = bio_nbytes = 0;
@@ -1746,7 +1752,11 @@ static void end_that_request_last(struct request *req, int error)
 	if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
-		struct hd_struct *part = disk_map_sector(disk, req->sector);
+		struct hd_struct *part;
+
+		rcu_read_lock();
+
+		part = disk_map_sector_rcu(disk, req->sector);
 
 		__all_stat_inc(disk, part, ios[rw], req->sector);
 		__all_stat_add(disk, part, ticks[rw], duration, req->sector);
@@ -1756,6 +1766,8 @@ static void end_that_request_last(struct request *req, int error)
 			part_round_stats(part);
 			part->in_flight--;
 		}
+
+		rcu_read_unlock();
 	}
 
 	if (req->end_io)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 9b17da698d7c..eb2a3ca58303 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -387,14 +387,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	elv_merge_requests(q, req, next);
 
 	if (req->rq_disk) {
-		struct hd_struct *part =
-			disk_map_sector(req->rq_disk, req->sector);
+		struct hd_struct *part;
+
+		rcu_read_lock();
+
+		part = disk_map_sector_rcu(req->rq_disk, req->sector);
 		disk_round_stats(req->rq_disk);
 		req->rq_disk->in_flight--;
 		if (part) {
 			part_round_stats(part);
 			part->in_flight--;
 		}
+
+		rcu_read_unlock();
 	}
 
 	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
diff --git a/block/genhd.c b/block/genhd.c
index fa32d09fda24..b431d6543942 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -26,6 +26,158 @@ struct kobject *block_depr;
 
 static struct device_type disk_type;
 
+/**
+ * disk_get_part - get partition
+ * @disk: disk to look partition from
+ * @partno: partition number
+ *
+ * Look for partition @partno from @disk.  If found, increment
+ * reference count and return it.
+ *
+ * CONTEXT:
+ * Don't care.
+ *
+ * RETURNS:
+ * Pointer to the found partition on success, NULL if not found.
+ */
+struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
+{
+	struct hd_struct *part;
+
+	if (unlikely(partno < 1 || partno > disk_max_parts(disk)))
+		return NULL;
+	rcu_read_lock();
+	part = rcu_dereference(disk->__part[partno - 1]);
+	if (part)
+		get_device(&part->dev);
+	rcu_read_unlock();
+
+	return part;
+}
+EXPORT_SYMBOL_GPL(disk_get_part);
+
+/**
+ * disk_part_iter_init - initialize partition iterator
+ * @piter: iterator to initialize
+ * @disk: disk to iterate over
+ * @flags: DISK_PITER_* flags
+ *
+ * Initialize @piter so that it iterates over partitions of @disk.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
+			  unsigned int flags)
+{
+	piter->disk = disk;
+	piter->part = NULL;
+
+	if (flags & DISK_PITER_REVERSE)
+		piter->idx = disk_max_parts(piter->disk) - 1;
+	else
+		piter->idx = 0;
+
+	piter->flags = flags;
+}
+EXPORT_SYMBOL_GPL(disk_part_iter_init);
+
+/**
+ * disk_part_iter_next - proceed iterator to the next partition and return it
+ * @piter: iterator of interest
+ *
+ * Proceed @piter to the next partition and return it.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
+{
+	int inc, end;
+
+	/* put the last partition */
+	disk_put_part(piter->part);
+	piter->part = NULL;
+
+	rcu_read_lock();
+
+	/* determine iteration parameters */
+	if (piter->flags & DISK_PITER_REVERSE) {
+		inc = -1;
+		end = -1;
+	} else {
+		inc = 1;
+		end = disk_max_parts(piter->disk);
+	}
+
+	/* iterate to the next partition */
+	for (; piter->idx != end; piter->idx += inc) {
+		struct hd_struct *part;
+
+		part = rcu_dereference(piter->disk->__part[piter->idx]);
+		if (!part)
+			continue;
+		if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects)
+			continue;
+
+		get_device(&part->dev);
+		piter->part = part;
+		piter->idx += inc;
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return piter->part;
+}
+EXPORT_SYMBOL_GPL(disk_part_iter_next);
+
+/**
+ * disk_part_iter_exit - finish up partition iteration
+ * @piter: iter of interest
+ *
+ * Called when iteration is over.  Cleans up @piter.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+void disk_part_iter_exit(struct disk_part_iter *piter)
+{
+	disk_put_part(piter->part);
+	piter->part = NULL;
+}
+EXPORT_SYMBOL_GPL(disk_part_iter_exit);
+
+/**
+ * disk_map_sector_rcu - map sector to partition
+ * @disk: gendisk of interest
+ * @sector: sector to map
+ *
+ * Find out which partition @sector maps to on @disk.  This is
+ * primarily used for stats accounting.
+ *
+ * CONTEXT:
+ * RCU read locked.  The returned partition pointer is valid only
+ * while preemption is disabled.
+ *
+ * RETURNS:
+ * Found partition on success, NULL if there's no matching partition.
+ */
+struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
+{
+	int i;
+
+	for (i = 0; i < disk_max_parts(disk); i++) {
+		struct hd_struct *part = rcu_dereference(disk->__part[i]);
+
+		if (part && part->start_sect <= sector &&
+		    sector < part->start_sect + part->nr_sects)
+			return part;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
+
 /*
  * Can be deleted altogether. Later.
  *
@@ -245,10 +397,12 @@ extern struct block_device *bdget_disk(struct gendisk *disk, int partno)
 	if (partno == 0)
 		devt = disk_devt(disk);
 	else {
-		struct hd_struct *part = disk->part[partno - 1];
+		struct hd_struct *part;
 
+		part = disk_get_part(disk, partno);
 		if (part && part->nr_sects)
 			devt = part_devt(part);
+		disk_put_part(part);
 	}
 
 	if (likely(devt != MKDEV(0, 0)))
@@ -270,8 +424,9 @@ void __init printk_all_partitions(void)
 	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
 	while ((dev = class_dev_iter_next(&iter))) {
 		struct gendisk *disk = dev_to_disk(dev);
+		struct disk_part_iter piter;
+		struct hd_struct *part;
 		char buf[BDEVNAME_SIZE];
-		int n;
 
 		/*
 		 * Don't show empty devices or things that have been
@@ -298,16 +453,13 @@ void __init printk_all_partitions(void)
 			printk(" (driver?)\n");
 
 		/* now show the partitions */
-		for (n = 0; n < disk_max_parts(disk); ++n) {
-			struct hd_struct *part = disk->part[n];
-
-			if (!part || !part->nr_sects)
-				continue;
+		disk_part_iter_init(&piter, disk, 0);
+		while ((part = disk_part_iter_next(&piter)))
 			printk("  %02x%02x %10llu %s\n",
 			       MAJOR(part_devt(part)), MINOR(part_devt(part)),
 			       (unsigned long long)part->nr_sects >> 1,
 			       disk_name(disk, part->partno, buf));
-		}
+		disk_part_iter_exit(&piter);
 	}
 	class_dev_iter_exit(&iter);
 }
@@ -371,7 +523,8 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 static int show_partition(struct seq_file *seqf, void *v)
 {
 	struct gendisk *sgp = v;
-	int n;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 	char buf[BDEVNAME_SIZE];
 
 	/* Don't show non-partitionable removeable devices or empty devices */
@@ -386,17 +539,14 @@ static int show_partition(struct seq_file *seqf, void *v)
 		MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)),
 		(unsigned long long)get_capacity(sgp) >> 1,
 		disk_name(sgp, 0, buf));
-	for (n = 0; n < disk_max_parts(sgp); n++) {
-		struct hd_struct *part = sgp->part[n];
-		if (!part)
-			continue;
-		if (part->nr_sects == 0)
-			continue;
+
+	disk_part_iter_init(&piter, sgp, 0);
+	while ((part = disk_part_iter_next(&piter)))
 		seq_printf(seqf, "%4d  %4d %10llu %s\n",
 			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
 			   (unsigned long long)part->nr_sects >> 1,
 			   disk_name(sgp, part->partno, buf));
-	}
+	disk_part_iter_exit(&piter);
 
 	return 0;
 }
@@ -571,7 +721,7 @@ static void disk_release(struct device *dev)
 	struct gendisk *disk = dev_to_disk(dev);
 
 	kfree(disk->random);
-	kfree(disk->part);
+	kfree(disk->__part);
 	free_disk_stats(disk);
 	kfree(disk);
 }
@@ -596,8 +746,9 @@ static struct device_type disk_type = {
 static int diskstats_show(struct seq_file *seqf, void *v)
 {
 	struct gendisk *gp = v;
+	struct disk_part_iter piter;
+	struct hd_struct *hd;
 	char buf[BDEVNAME_SIZE];
-	int n;
 
 	/*
 	if (&gp->dev.kobj.entry == block_class.devices.next)
@@ -624,12 +775,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
 
 	/* now show all non-0 size partitions of it */
-	for (n = 0; n < disk_max_parts(gp); n++) {
-		struct hd_struct *hd = gp->part[n];
-
-		if (!hd || !hd->nr_sects)
-			continue;
-
+	disk_part_iter_init(&piter, gp, 0);
+	while ((hd = disk_part_iter_next(&piter))) {
 		preempt_disable();
 		part_round_stats(hd);
 		preempt_enable();
@@ -650,6 +797,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
 			);
 	}
+	disk_part_iter_exit(&piter);
  
 	return 0;
 }
@@ -703,12 +851,16 @@ dev_t blk_lookup_devt(const char *name, int partno)
 		if (partno == 0)
 			devt = disk_devt(disk);
 		else {
-			struct hd_struct *part = disk->part[partno - 1];
+			struct hd_struct *part;
 
-			if (!part || !part->nr_sects)
+			part = disk_get_part(disk, partno);
+			if (!part || !part->nr_sects) {
+				disk_put_part(part);
 				continue;
+			}
 
 			devt = part_devt(part);
+			disk_put_part(part);
 		}
 		break;
 	}
@@ -735,9 +887,9 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 		}
 		if (minors > 1) {
 			int size = (minors - 1) * sizeof(struct hd_struct *);
-			disk->part = kmalloc_node(size,
+			disk->__part = kmalloc_node(size,
 				GFP_KERNEL | __GFP_ZERO, node_id);
-			if (!disk->part) {
+			if (!disk->__part) {
 				free_disk_stats(disk);
 				kfree(disk);
 				return NULL;
@@ -798,10 +950,14 @@ EXPORT_SYMBOL(set_device_ro);
 
 void set_disk_ro(struct gendisk *disk, int flag)
 {
-	int i;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
+
 	disk->policy = flag;
-	for (i = 0; i < disk_max_parts(disk); i++)
-		if (disk->part[i]) disk->part[i]->policy = flag;
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+	while ((part = disk_part_iter_next(&piter)))
+		part->policy = flag;
+	disk_part_iter_exit(&piter);
 }
 
 EXPORT_SYMBOL(set_disk_ro);
diff --git a/block/ioctl.c b/block/ioctl.c
index 403f7d7e0c28..a5f672ad55f6 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -12,11 +12,12 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 {
 	struct block_device *bdevp;
 	struct gendisk *disk;
+	struct hd_struct *part;
 	struct blkpg_ioctl_arg a;
 	struct blkpg_partition p;
+	struct disk_part_iter piter;
 	long long start, length;
 	int partno;
-	int i;
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -47,28 +48,33 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			mutex_lock(&bdev->bd_mutex);
 
 			/* overlap? */
-			for (i = 0; i < disk_max_parts(disk); i++) {
-				struct hd_struct *s = disk->part[i];
-
-				if (!s)
-					continue;
-				if (!(start+length <= s->start_sect ||
-				      start >= s->start_sect + s->nr_sects)) {
+			disk_part_iter_init(&piter, disk,
+					    DISK_PITER_INCL_EMPTY);
+			while ((part = disk_part_iter_next(&piter))) {
+				if (!(start + length <= part->start_sect ||
+				      start >= part->start_sect + part->nr_sects)) {
+					disk_part_iter_exit(&piter);
 					mutex_unlock(&bdev->bd_mutex);
 					return -EBUSY;
 				}
 			}
+			disk_part_iter_exit(&piter);
+
 			/* all seems OK */
 			err = add_partition(disk, partno, start, length,
 					    ADDPART_FLAG_NONE);
 			mutex_unlock(&bdev->bd_mutex);
 			return err;
 		case BLKPG_DEL_PARTITION:
-			if (!disk->part[partno - 1])
+			part = disk_get_part(disk, partno);
+			if (!part)
 				return -ENXIO;
-			bdevp = bdget_disk(disk, partno);
+
+			bdevp = bdget(part_devt(part));
+			disk_put_part(part);
 			if (!bdevp)
 				return -ENOMEM;
+
 			mutex_lock(&bdevp->bd_mutex);
 			if (bdevp->bd_openers) {
 				mutex_unlock(&bdevp->bd_mutex);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 885d1409521f..84c03d65dcc5 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -757,11 +757,15 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
 	const int rw = bio_data_dir(bio);
 	struct hd_struct *part;
 
-	part = disk_map_sector(disk, sector);
+	rcu_read_lock();
+
+	part = disk_map_sector_rcu(disk, sector);
 	all_stat_inc(disk, part, ios[rw], sector);
 	all_stat_add(disk, part, ticks[rw], duration, sector);
 	all_stat_add(disk, part, sectors[rw], n_sect, sector);
 	all_stat_add(disk, part, io_ticks, duration, sector);
+
+	rcu_read_unlock();
 }
 
 void
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 72e0a2887cb7..2f2873b9a041 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -929,6 +929,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 {
 	struct module *owner = NULL;
 	struct gendisk *disk;
+	struct hd_struct *part = NULL;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -978,7 +979,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (bdev->bd_invalidated)
 				rescan_partitions(disk, bdev);
 		} else {
-			struct hd_struct *p;
 			struct block_device *whole;
 			whole = bdget_disk(disk, 0);
 			ret = -ENOMEM;
@@ -989,16 +989,16 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			if (ret)
 				goto out_first;
 			bdev->bd_contains = whole;
-			p = disk->part[partno - 1];
+			part = disk_get_part(disk, partno);
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
-			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
+			if (!(disk->flags & GENHD_FL_UP) ||
+			    !part || !part->nr_sects) {
 				ret = -ENXIO;
 				goto out_first;
 			}
-			kobject_get(&p->dev.kobj);
-			bdev->bd_part = p;
-			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
+			bdev->bd_part = part;
+			bd_set_size(bdev, (loff_t)part->nr_sects << 9);
 		}
 	} else {
 		put_disk(disk);
@@ -1027,6 +1027,7 @@ out_first:
 		__blkdev_put(bdev->bd_contains, 1);
 	bdev->bd_contains = NULL;
 	put_disk(disk);
+	disk_put_part(part);
 	module_put(owner);
 out:
 	mutex_unlock(&bdev->bd_mutex);
@@ -1119,7 +1120,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 		module_put(owner);
 
 		if (bdev->bd_contains != bdev) {
-			kobject_put(&bdev->bd_part->dev.kobj);
+			disk_put_part(bdev->bd_part);
 			bdev->bd_part = NULL;
 		}
 		bdev->bd_disk = NULL;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e77fa144a07d..96c8bf41e455 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -314,19 +314,29 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 	kobject_put(k);
 }
 
+static void delete_partition_rcu_cb(struct rcu_head *head)
+{
+	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
+
+	part->start_sect = 0;
+	part->nr_sects = 0;
+	part_stat_set_all(part, 0);
+	put_device(&part->dev);
+}
+
 void delete_partition(struct gendisk *disk, int partno)
 {
-	struct hd_struct *p = disk->part[partno - 1];
+	struct hd_struct *part;
 
-	if (!p)
+	part = disk->__part[partno-1];
+	if (!part)
 		return;
-	disk->part[partno - 1] = NULL;
-	p->start_sect = 0;
-	p->nr_sects = 0;
-	part_stat_set_all(p, 0);
-	kobject_put(p->holder_dir);
-	device_del(&p->dev);
-	put_device(&p->dev);
+
+	rcu_assign_pointer(disk->__part[partno-1], NULL);
+	kobject_put(part->holder_dir);
+	device_del(&part->dev);
+
+	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
 
 static ssize_t whole_disk_show(struct device *dev,
@@ -343,7 +353,7 @@ int add_partition(struct gendisk *disk, int partno,
 	struct hd_struct *p;
 	int err;
 
-	if (disk->part[partno - 1])
+	if (disk->__part[partno - 1])
 		return -EBUSY;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -391,7 +401,8 @@ int add_partition(struct gendisk *disk, int partno,
 	}
 
 	/* everything is up and running, commence */
-	disk->part[partno - 1] = p;
+	INIT_RCU_HEAD(&p->rcu_head);
+	rcu_assign_pointer(disk->__part[partno - 1], p);
 
 	/* suppress uevent if the disk supresses it */
 	if (!disk->dev.uevent_suppress)
@@ -414,9 +425,9 @@ out_put:
 void register_disk(struct gendisk *disk)
 {
 	struct block_device *bdev;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 	char *s;
-	int i;
-	struct hd_struct *p;
 	int err;
 
 	disk->dev.parent = disk->driverfs_dev;
@@ -466,16 +477,16 @@ exit:
 	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
-	for (i = 0; i < disk_max_parts(disk); i++) {
-		p = disk->part[i];
-		if (!p || !p->nr_sects)
-			continue;
-		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
-	}
+	disk_part_iter_init(&piter, disk, 0);
+	while ((part = disk_part_iter_next(&piter)))
+		kobject_uevent(&part->dev.kobj, KOBJ_ADD);
+	disk_part_iter_exit(&piter);
 }
 
 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 {
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 	struct parsed_partitions *state;
 	int p, res;
 
@@ -485,8 +496,12 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	if (res)
 		return res;
 	bdev->bd_invalidated = 0;
-	for (p = 1; p <= disk_max_parts(disk); p++)
-		delete_partition(disk, p);
+
+	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+	while ((part = disk_part_iter_next(&piter)))
+		delete_partition(disk, part->partno);
+	disk_part_iter_exit(&piter);
+
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
@@ -545,13 +560,18 @@ EXPORT_SYMBOL(read_dev_sector);
 
 void del_gendisk(struct gendisk *disk)
 {
-	int p;
+	struct disk_part_iter piter;
+	struct hd_struct *part;
 
 	/* invalidate stuff */
-	for (p = disk_max_parts(disk); p > 0; p--) {
-		invalidate_partition(disk, p);
-		delete_partition(disk, p);
+	disk_part_iter_init(&piter, disk,
+			     DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
+	while ((part = disk_part_iter_next(&piter))) {
+		invalidate_partition(disk, part->partno);
+		delete_partition(disk, part->partno);
 	}
+	disk_part_iter_exit(&piter);
+
 	invalidate_partition(disk, 0);
 	disk->capacity = 0;
 	disk->flags &= ~GENHD_FL_UP;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0ff75329199c..7fbba19e076b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -11,6 +11,7 @@
 
 #include <linux/types.h>
 #include <linux/kdev_t.h>
+#include <linux/rcupdate.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -100,6 +101,7 @@ struct hd_struct {
 #else
 	struct disk_stats dkstats;
 #endif
+	struct rcu_head rcu_head;
 };
 
 #define GENHD_FL_REMOVABLE			1
@@ -120,7 +122,14 @@ struct gendisk {
                                          * disks that can't be partitioned. */
 
 	char disk_name[32];		/* name of major driver */
-	struct hd_struct **part;	/* [indexed by minor - 1] */
+
+	/* Array of pointers to partitions indexed by partno - 1.
+	 * Protected with matching bdev lock but stat and other
+	 * non-critical accesses use RCU.  Always access through
+	 * helpers.
+	 */
+	struct hd_struct **__part;
+
 	struct block_device_operations *fops;
 	struct request_queue *queue;
 	void *private_data;
@@ -171,25 +180,41 @@ static inline dev_t part_devt(struct hd_struct *part)
 	return part->dev.devt;
 }
 
+extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);
+
+static inline void disk_put_part(struct hd_struct *part)
+{
+	if (likely(part))
+		put_device(&part->dev);
+}
+
+/*
+ * Smarter partition iterator without context limits.
+ */
+#define DISK_PITER_REVERSE	(1 << 0) /* iterate in the reverse direction */
+#define DISK_PITER_INCL_EMPTY	(1 << 1) /* include 0-sized parts */
+
+struct disk_part_iter {
+	struct gendisk		*disk;
+	struct hd_struct	*part;
+	int			idx;
+	unsigned int		flags;
+};
+
+extern void disk_part_iter_init(struct disk_part_iter *piter,
+				 struct gendisk *disk, unsigned int flags);
+extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter);
+extern void disk_part_iter_exit(struct disk_part_iter *piter);
+
+extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
+					     sector_t sector);
+
 /* 
  * Macros to operate on percpu disk statistics:
  *
  * The __ variants should only be called in critical sections. The full
  * variants disable/enable preemption.
  */
-static inline struct hd_struct *disk_map_sector(struct gendisk *gendiskp,
-						sector_t sector)
-{
-	struct hd_struct *part;
-	int i;
-	for (i = 0; i < disk_max_parts(gendiskp); i++) {
-		part = gendiskp->part[i];
-		if (part && part->start_sect <= sector
-		    && sector < part->start_sect + part->nr_sects)
-			return part;
-	}
-	return NULL;
-}
 
 #ifdef	CONFIG_SMP
 #define __disk_stat_add(gendiskp, field, addnd) 	\

From c9959059161ddd7bf4670cf47367033d6b2f79c4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:47:21 +0900
Subject: [PATCH 037/132] block: fix diskstats access

There are two variants of stat functions - ones prefixed with double
underbars which don't care about preemption and ones without which
disable preemption before manipulating per-cpu counters.  It's unclear
whether the underbarred ones assume that preemption is disabled on
entry as some callers don't do that.

This patch unifies diskstats access by implementing disk_stat_lock()
and disk_stat_unlock() which take care of both RCU (for partition
access) and preemption (for per-cpu counter access).  diskstats access
should always be enclosed between the two functions.  As such, there's
no need for the versions which disable preemption.  They're removed
and the double-underbar ones are renamed to drop the underbars.  As an
extra argument is added, there's no danger of using the old version
unconverted.

disk_stat_lock() uses get_cpu() and returns the cpu index and all
diskstat functions which access per-cpu counters now have a @cpu
argument to help RT.

This change adds RCU or preemption operations at some places but also
collapses several preemption ops into one at others.  Overall, the
performance difference should be negligible as all involved ops are
very lightweight per-cpu ones.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c           |  52 +++++++-------
 block/blk-merge.c          |  11 +--
 block/genhd.c              |  20 +++---
 drivers/block/aoe/aoecmd.c |  15 +++--
 drivers/md/dm.c            |  26 ++++---
 drivers/md/linear.c        |   7 +-
 drivers/md/multipath.c     |   7 +-
 drivers/md/raid0.c         |   7 +-
 drivers/md/raid1.c         |   8 ++-
 drivers/md/raid10.c        |   7 +-
 drivers/md/raid5.c         |   8 ++-
 fs/partitions/check.c      |   7 +-
 include/linux/genhd.h      | 135 +++++++++++++++----------------------
 13 files changed, 156 insertions(+), 154 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d6128d9ad601..e0a5ee36849c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -56,25 +56,26 @@ static void drive_stat_acct(struct request *rq, int new_io)
 {
 	struct hd_struct *part;
 	int rw = rq_data_dir(rq);
+	int cpu;
 
 	if (!blk_fs_request(rq) || !rq->rq_disk)
 		return;
 
-	rcu_read_lock();
-
+	cpu = disk_stat_lock();
 	part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
+
 	if (!new_io)
-		__all_stat_inc(rq->rq_disk, part, merges[rw], rq->sector);
+		all_stat_inc(cpu, rq->rq_disk, part, merges[rw], rq->sector);
 	else {
-		disk_round_stats(rq->rq_disk);
+		disk_round_stats(cpu, rq->rq_disk);
 		rq->rq_disk->in_flight++;
 		if (part) {
-			part_round_stats(part);
+			part_round_stats(cpu, part);
 			part->in_flight++;
 		}
 	}
 
-	rcu_read_unlock();
+	disk_stat_unlock();
 }
 
 void blk_queue_congestion_threshold(struct request_queue *q)
@@ -997,7 +998,7 @@ static inline void add_request(struct request_queue *q, struct request *req)
  * /proc/diskstats.  This accounts immediately for all queue usage up to
  * the current jiffies and restarts the counters again.
  */
-void disk_round_stats(struct gendisk *disk)
+void disk_round_stats(int cpu, struct gendisk *disk)
 {
 	unsigned long now = jiffies;
 
@@ -1005,15 +1006,15 @@ void disk_round_stats(struct gendisk *disk)
 		return;
 
 	if (disk->in_flight) {
-		__disk_stat_add(disk, time_in_queue,
-				disk->in_flight * (now - disk->stamp));
-		__disk_stat_add(disk, io_ticks, (now - disk->stamp));
+		disk_stat_add(cpu, disk, time_in_queue,
+			      disk->in_flight * (now - disk->stamp));
+		disk_stat_add(cpu, disk, io_ticks, (now - disk->stamp));
 	}
 	disk->stamp = now;
 }
 EXPORT_SYMBOL_GPL(disk_round_stats);
 
-void part_round_stats(struct hd_struct *part)
+void part_round_stats(int cpu, struct hd_struct *part)
 {
 	unsigned long now = jiffies;
 
@@ -1021,9 +1022,9 @@ void part_round_stats(struct hd_struct *part)
 		return;
 
 	if (part->in_flight) {
-		__part_stat_add(part, time_in_queue,
-				part->in_flight * (now - part->stamp));
-		__part_stat_add(part, io_ticks, (now - part->stamp));
+		part_stat_add(cpu, part, time_in_queue,
+			      part->in_flight * (now - part->stamp));
+		part_stat_add(cpu, part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
 }
@@ -1563,12 +1564,13 @@ static int __end_that_request_first(struct request *req, int error,
 	if (blk_fs_request(req) && req->rq_disk) {
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
+		int cpu;
 
-		rcu_read_lock();
+		cpu = disk_stat_lock();
 		part = disk_map_sector_rcu(req->rq_disk, req->sector);
-		all_stat_add(req->rq_disk, part, sectors[rw],
-				nr_bytes >> 9, req->sector);
-		rcu_read_unlock();
+		all_stat_add(cpu, req->rq_disk, part, sectors[rw],
+			     nr_bytes >> 9, req->sector);
+		disk_stat_unlock();
 	}
 
 	total_bytes = bio_nbytes = 0;
@@ -1753,21 +1755,21 @@ static void end_that_request_last(struct request *req, int error)
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
+		int cpu;
 
-		rcu_read_lock();
-
+		cpu = disk_stat_lock();
 		part = disk_map_sector_rcu(disk, req->sector);
 
-		__all_stat_inc(disk, part, ios[rw], req->sector);
-		__all_stat_add(disk, part, ticks[rw], duration, req->sector);
-		disk_round_stats(disk);
+		all_stat_inc(cpu, disk, part, ios[rw], req->sector);
+		all_stat_add(cpu, disk, part, ticks[rw], duration, req->sector);
+		disk_round_stats(cpu, disk);
 		disk->in_flight--;
 		if (part) {
-			part_round_stats(part);
+			part_round_stats(cpu, part);
 			part->in_flight--;
 		}
 
-		rcu_read_unlock();
+		disk_stat_unlock();
 	}
 
 	if (req->end_io)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index eb2a3ca58303..d926a24bf1fd 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -388,18 +388,19 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 
 	if (req->rq_disk) {
 		struct hd_struct *part;
+		int cpu;
 
-		rcu_read_lock();
-
+		cpu = disk_stat_lock();
 		part = disk_map_sector_rcu(req->rq_disk, req->sector);
-		disk_round_stats(req->rq_disk);
+
+		disk_round_stats(cpu, req->rq_disk);
 		req->rq_disk->in_flight--;
 		if (part) {
-			part_round_stats(part);
+			part_round_stats(cpu, part);
 			part->in_flight--;
 		}
 
-		rcu_read_unlock();
+		disk_stat_unlock();
 	}
 
 	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
diff --git a/block/genhd.c b/block/genhd.c
index b431d6543942..430626e440f0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -633,10 +633,11 @@ static ssize_t disk_stat_show(struct device *dev,
 			      struct device_attribute *attr, char *buf)
 {
 	struct gendisk *disk = dev_to_disk(dev);
+	int cpu;
 
-	preempt_disable();
-	disk_round_stats(disk);
-	preempt_enable();
+	cpu = disk_stat_lock();
+	disk_round_stats(cpu, disk);
+	disk_stat_unlock();
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
@@ -749,6 +750,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	struct disk_part_iter piter;
 	struct hd_struct *hd;
 	char buf[BDEVNAME_SIZE];
+	int cpu;
 
 	/*
 	if (&gp->dev.kobj.entry == block_class.devices.next)
@@ -758,9 +760,9 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 				"\n\n");
 	*/
  
-	preempt_disable();
-	disk_round_stats(gp);
-	preempt_enable();
+	cpu = disk_stat_lock();
+	disk_round_stats(cpu, gp);
+	disk_stat_unlock();
 	seq_printf(seqf, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
 		MAJOR(disk_devt(gp)), MINOR(disk_devt(gp)),
 		disk_name(gp, 0, buf),
@@ -777,9 +779,9 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	/* now show all non-0 size partitions of it */
 	disk_part_iter_init(&piter, gp, 0);
 	while ((hd = disk_part_iter_next(&piter))) {
-		preempt_disable();
-		part_round_stats(hd);
-		preempt_enable();
+		cpu = disk_stat_lock();
+		part_round_stats(cpu, hd);
+		disk_stat_unlock();
 		seq_printf(seqf, "%4d %4d %s %lu %lu %llu "
 			   "%u %lu %lu %llu %u %u %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 84c03d65dcc5..17eed8c025d0 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -756,16 +756,17 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
 	unsigned long n_sect = bio->bi_size >> 9;
 	const int rw = bio_data_dir(bio);
 	struct hd_struct *part;
+	int cpu;
 
-	rcu_read_lock();
-
+	cpu = disk_stat_lock();
 	part = disk_map_sector_rcu(disk, sector);
-	all_stat_inc(disk, part, ios[rw], sector);
-	all_stat_add(disk, part, ticks[rw], duration, sector);
-	all_stat_add(disk, part, sectors[rw], n_sect, sector);
-	all_stat_add(disk, part, io_ticks, duration, sector);
 
-	rcu_read_unlock();
+	all_stat_inc(cpu, disk, part, ios[rw], sector);
+	all_stat_add(cpu, disk, part, ticks[rw], duration, sector);
+	all_stat_add(cpu, disk, part, sectors[rw], n_sect, sector);
+	all_stat_add(cpu, disk, part, io_ticks, duration, sector);
+
+	disk_stat_unlock();
 }
 
 void
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a78caad29996..653624792eaf 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -377,12 +377,13 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
+	int cpu;
 
 	io->start_time = jiffies;
 
-	preempt_disable();
-	disk_round_stats(dm_disk(md));
-	preempt_enable();
+	cpu = disk_stat_lock();
+	disk_round_stats(cpu, dm_disk(md));
+	disk_stat_unlock();
 	dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
 }
 
@@ -391,15 +392,15 @@ static int end_io_acct(struct dm_io *io)
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->bio;
 	unsigned long duration = jiffies - io->start_time;
-	int pending;
+	int pending, cpu;
 	int rw = bio_data_dir(bio);
 
-	preempt_disable();
-	disk_round_stats(dm_disk(md));
-	preempt_enable();
-	dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);
+	cpu = disk_stat_lock();
+	disk_round_stats(cpu, dm_disk(md));
+	disk_stat_add(cpu, dm_disk(md), ticks[rw], duration);
+	disk_stat_unlock();
 
-	disk_stat_add(dm_disk(md), ticks[rw], duration);
+	dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);
 
 	return !pending;
 }
@@ -885,6 +886,7 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	int r = -EIO;
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
+	int cpu;
 
 	/*
 	 * There is no use in forwarding any barrier request since we can't
@@ -897,8 +899,10 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 
 	down_read(&md->io_lock);
 
-	disk_stat_inc(dm_disk(md), ios[rw]);
-	disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));
+	cpu = disk_stat_lock();
+	disk_stat_inc(cpu, dm_disk(md), ios[rw]);
+	disk_stat_add(cpu, dm_disk(md), sectors[rw], bio_sectors(bio));
+	disk_stat_unlock();
 
 	/*
 	 * If we're suspended we have to queue
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index b1eebf88c209..00cbc8e47294 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -318,14 +318,17 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	mddev_t *mddev = q->queuedata;
 	dev_info_t *tmp_dev;
 	sector_t block;
+	int cpu;
 
 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
 
-	disk_stat_inc(mddev->gendisk, ios[rw]);
-	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
+	cpu = disk_stat_lock();
+	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
+	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
+	disk_stat_unlock();
 
 	tmp_dev = which_dev(mddev, bio->bi_sector);
 	block = bio->bi_sector >> 1;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index c4779ccba1c3..182f5a94cdc5 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -147,6 +147,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 	const int rw = bio_data_dir(bio);
+	int cpu;
 
 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, -EOPNOTSUPP);
@@ -158,8 +159,10 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
 
-	disk_stat_inc(mddev->gendisk, ios[rw]);
-	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
+	cpu = disk_stat_lock();
+	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
+	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
+	disk_stat_unlock();
 
 	mp_bh->path = multipath_map(conf);
 	if (mp_bh->path < 0) {
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 183610635661..e26030fa59ab 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -399,14 +399,17 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 	sector_t chunk;
 	sector_t block, rsect;
 	const int rw = bio_data_dir(bio);
+	int cpu;
 
 	if (unlikely(bio_barrier(bio))) {
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
 
-	disk_stat_inc(mddev->gendisk, ios[rw]);
-	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
+	cpu = disk_stat_lock();
+	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
+	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
+	disk_stat_unlock();
 
 	chunk_size = mddev->chunk_size >> 10;
 	chunk_sects = mddev->chunk_size >> 9;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b82030c265d..babb13036f93 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -779,7 +779,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const int do_sync = bio_sync(bio);
-	int do_barriers;
+	int cpu, do_barriers;
 	mdk_rdev_t *blocked_rdev;
 
 	/*
@@ -804,8 +804,10 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	bitmap = mddev->bitmap;
 
-	disk_stat_inc(mddev->gendisk, ios[rw]);
-	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
+	cpu = disk_stat_lock();
+	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
+	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
+	disk_stat_unlock();
 
 	/*
 	 * make_request() can abort the operation when READA is being
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d3b9aa096285..5ec80da0a9d7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -789,6 +789,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	mirror_info_t *mirror;
 	r10bio_t *r10_bio;
 	struct bio *read_bio;
+	int cpu;
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
@@ -843,8 +844,10 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	 */
 	wait_barrier(conf);
 
-	disk_stat_inc(mddev->gendisk, ios[rw]);
-	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
+	cpu = disk_stat_lock();
+	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
+	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
+	disk_stat_unlock();
 
 	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 37e546528f9c..5899f211515f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3387,7 +3387,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
-	int remaining;
+	int cpu, remaining;
 
 	if (unlikely(bio_barrier(bi))) {
 		bio_endio(bi, -EOPNOTSUPP);
@@ -3396,8 +3396,10 @@ static int make_request(struct request_queue *q, struct bio * bi)
 
 	md_write_start(mddev, bi);
 
-	disk_stat_inc(mddev->gendisk, ios[rw]);
-	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
+	cpu = disk_stat_lock();
+	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
+	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bi));
+	disk_stat_unlock();
 
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 96c8bf41e455..c442f0aadac3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,10 +219,11 @@ static ssize_t part_stat_show(struct device *dev,
 			      struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
+	int cpu;
 
-	preempt_disable();
-	part_round_stats(p);
-	preempt_enable();
+	cpu = disk_stat_lock();
+	part_round_stats(cpu, p);
+	disk_stat_unlock();
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 7fbba19e076b..ac8a901f2002 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -209,16 +209,24 @@ extern void disk_part_iter_exit(struct disk_part_iter *piter);
 extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
 					     sector_t sector);
 
-/* 
+/*
  * Macros to operate on percpu disk statistics:
  *
- * The __ variants should only be called in critical sections. The full
- * variants disable/enable preemption.
+ * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters
+ * and should be called between disk_stat_lock() and
+ * disk_stat_unlock().
+ *
+ * part_stat_read() can be called at any time.
+ *
+ * part_stat_{add|set_all}() and {init|free}_part_stats are for
+ * internal use only.
  */
-
 #ifdef	CONFIG_SMP
-#define __disk_stat_add(gendiskp, field, addnd) 	\
-	(per_cpu_ptr(gendiskp->dkstats, smp_processor_id())->field += addnd)
+#define disk_stat_lock()	({ rcu_read_lock(); get_cpu(); })
+#define disk_stat_unlock()	do { put_cpu(); rcu_read_unlock(); } while (0)
+
+#define disk_stat_add(cpu, gendiskp, field, addnd)			\
+	(per_cpu_ptr(gendiskp->dkstats, cpu)->field += addnd)
 
 #define disk_stat_read(gendiskp, field)					\
 ({									\
@@ -229,7 +237,8 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
 	res;								\
 })
 
-static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)	{
+static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)
+{
 	int i;
 
 	for_each_possible_cpu(i)
@@ -237,14 +246,14 @@ static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)	{
 				sizeof(struct disk_stats));
 }		
 
-#define __part_stat_add(part, field, addnd)				\
-	(per_cpu_ptr(part->dkstats, smp_processor_id())->field += addnd)
+#define part_stat_add(cpu, part, field, addnd)				\
+	(per_cpu_ptr(part->dkstats, cpu)->field += addnd)
 
-#define __all_stat_add(gendiskp, part, field, addnd, sector)	\
-({								\
-	if (part)						\
-		__part_stat_add(part, field, addnd);		\
-	__disk_stat_add(gendiskp, field, addnd);		\
+#define all_stat_add(cpu, gendiskp, part, field, addnd, sector)		\
+({									\
+	if (part)							\
+		part_stat_add(cpu, part, field, addnd);			\
+	disk_stat_add(cpu, gendiskp, field, addnd);			\
 })
 
 #define part_stat_read(part, field)					\
@@ -264,10 +273,13 @@ static inline void part_stat_set_all(struct hd_struct *part, int value)
 		memset(per_cpu_ptr(part->dkstats, i), value,
 				sizeof(struct disk_stats));
 }
-				
+
 #else /* !CONFIG_SMP */
-#define __disk_stat_add(gendiskp, field, addnd) \
-				(gendiskp->dkstats.field += addnd)
+#define disk_stat_lock()	({ rcu_read_lock(); 0; })
+#define disk_stat_unlock()	rcu_read_unlock()
+
+#define disk_stat_add(cpu, gendiskp, field, addnd)			\
+	(gendiskp->dkstats.field += addnd)
 #define disk_stat_read(gendiskp, field)	(gendiskp->dkstats.field)
 
 static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)
@@ -275,14 +287,14 @@ static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)
 	memset(&gendiskp->dkstats, value, sizeof (struct disk_stats));
 }
 
-#define __part_stat_add(part, field, addnd) \
+#define part_stat_add(cpu, part, field, addnd)				\
 	(part->dkstats.field += addnd)
 
-#define __all_stat_add(gendiskp, part, field, addnd, sector)	\
-({								\
-	if (part)						\
-		part->dkstats.field += addnd;			\
-	__disk_stat_add(gendiskp, field, addnd);		\
+#define all_stat_add(cpu, gendiskp, part, field, addnd, sector)		\
+({									\
+	if (part)							\
+		part_stat_add(cpu, part, field, addnd);			\
+	disk_stat_add(cpu, gendiskp, field, addnd);			\
 })
 
 #define part_stat_read(part, field)	(part->dkstats.field)
@@ -294,63 +306,26 @@ static inline void part_stat_set_all(struct hd_struct *part, int value)
 
 #endif /* CONFIG_SMP */
 
-#define disk_stat_add(gendiskp, field, addnd)			\
-	do {							\
-		preempt_disable();				\
-		__disk_stat_add(gendiskp, field, addnd);	\
-		preempt_enable();				\
-	} while (0)
+#define disk_stat_dec(cpu, gendiskp, field)				\
+	disk_stat_add(cpu, gendiskp, field, -1)
+#define disk_stat_inc(cpu, gendiskp, field)				\
+	disk_stat_add(cpu, gendiskp, field, 1)
+#define disk_stat_sub(cpu, gendiskp, field, subnd)			\
+	disk_stat_add(cpu, gendiskp, field, -subnd)
 
-#define __disk_stat_dec(gendiskp, field) __disk_stat_add(gendiskp, field, -1)
-#define disk_stat_dec(gendiskp, field) disk_stat_add(gendiskp, field, -1)
+#define part_stat_dec(cpu, gendiskp, field)				\
+	part_stat_add(cpu, gendiskp, field, -1)
+#define part_stat_inc(cpu, gendiskp, field)				\
+	part_stat_add(cpu, gendiskp, field, 1)
+#define part_stat_sub(cpu, gendiskp, field, subnd)			\
+	part_stat_add(cpu, gendiskp, field, -subnd)
 
-#define __disk_stat_inc(gendiskp, field) __disk_stat_add(gendiskp, field, 1)
-#define disk_stat_inc(gendiskp, field) disk_stat_add(gendiskp, field, 1)
-
-#define __disk_stat_sub(gendiskp, field, subnd) \
-		__disk_stat_add(gendiskp, field, -subnd)
-#define disk_stat_sub(gendiskp, field, subnd) \
-		disk_stat_add(gendiskp, field, -subnd)
-
-#define part_stat_add(gendiskp, field, addnd)		\
-	do {						\
-		preempt_disable();			\
-		__part_stat_add(gendiskp, field, addnd);\
-		preempt_enable();			\
-	} while (0)
-
-#define __part_stat_dec(gendiskp, field) __part_stat_add(gendiskp, field, -1)
-#define part_stat_dec(gendiskp, field) part_stat_add(gendiskp, field, -1)
-
-#define __part_stat_inc(gendiskp, field) __part_stat_add(gendiskp, field, 1)
-#define part_stat_inc(gendiskp, field) part_stat_add(gendiskp, field, 1)
-
-#define __part_stat_sub(gendiskp, field, subnd) \
-		__part_stat_add(gendiskp, field, -subnd)
-#define part_stat_sub(gendiskp, field, subnd) \
-		part_stat_add(gendiskp, field, -subnd)
-
-#define all_stat_add(gendiskp, part, field, addnd, sector)	\
-	do {							\
-		preempt_disable();				\
-		__all_stat_add(gendiskp, part, field, addnd, sector);	\
-		preempt_enable();				\
-	} while (0)
-
-#define __all_stat_dec(gendiskp, field, sector) \
-		__all_stat_add(gendiskp, field, -1, sector)
-#define all_stat_dec(gendiskp, field, sector) \
-		all_stat_add(gendiskp, field, -1, sector)
-
-#define __all_stat_inc(gendiskp, part, field, sector) \
-		__all_stat_add(gendiskp, part, field, 1, sector)
-#define all_stat_inc(gendiskp, part, field, sector) \
-		all_stat_add(gendiskp, part, field, 1, sector)
-
-#define __all_stat_sub(gendiskp, part, field, subnd, sector) \
-		__all_stat_add(gendiskp, part, field, -subnd, sector)
-#define all_stat_sub(gendiskp, part, field, subnd, sector) \
-		all_stat_add(gendiskp, part, field, -subnd, sector)
+#define all_stat_dec(cpu, gendiskp, field, sector)			\
+	all_stat_add(cpu, gendiskp, field, -1, sector)
+#define all_stat_inc(cpu, gendiskp, part, field, sector)		\
+	all_stat_add(cpu, gendiskp, part, field, 1, sector)
+#define all_stat_sub(cpu, gendiskp, part, field, subnd, sector)		\
+	all_stat_add(cpu, gendiskp, part, field, -subnd, sector)
 
 /* Inlines to alloc and free disk stats in struct gendisk */
 #ifdef  CONFIG_SMP
@@ -401,8 +376,8 @@ static inline void free_part_stats(struct hd_struct *part)
 #endif	/* CONFIG_SMP */
 
 /* drivers/block/ll_rw_blk.c */
-extern void disk_round_stats(struct gendisk *disk);
-extern void part_round_stats(struct hd_struct *part);
+extern void disk_round_stats(int cpu, struct gendisk *disk);
+extern void part_round_stats(int cpu, struct hd_struct *part);
 
 /* drivers/block/genhd.c */
 extern int get_blkdev_list(char *, int);

From bcce3de1be61e424deef35d1e86e86a35c4b6e65 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:47:22 +0900
Subject: [PATCH 038/132] block: implement extended dev numbers

Implement extended device numbers.  A block driver can tell block
layer that it wants to use extended device numbers.  After the usual
minor space is used up, block layer automatically allocates devt's
from EXT_BLOCK_MAJOR.

Currently only one major number is allocated for this.  As the
allocation is strictly on-demand, the ~1 million minors under it
should suffice unless the system actually has more than ~1 million
partitions; if that ever happens, adding more majors to the extended
devt area is easy.

Due to internal implementation issues, the first partition can't be
allocated on the extended area.  In other words, genhd->minors should
at least be 1.  This limitation will be lifted by later changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 120 ++++++++++++++++++++++++++++++++++++++++--
 fs/partitions/check.c |   9 +++-
 include/linux/genhd.h |  13 +++--
 include/linux/major.h |   2 +
 4 files changed, 135 insertions(+), 9 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 430626e440f0..7bbfed05cecb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -16,6 +16,7 @@
 #include <linux/kobj_map.h>
 #include <linux/buffer_head.h>
 #include <linux/mutex.h>
+#include <linux/idr.h>
 
 #include "blk.h"
 
@@ -24,6 +25,15 @@ static DEFINE_MUTEX(block_class_lock);
 struct kobject *block_depr;
 #endif
 
+/* for extended dynamic devt allocation, currently only one major is used */
+#define MAX_EXT_DEVT		(1 << MINORBITS)
+
+/* For extended devt allocation.  ext_devt_mutex prevents look up
+ * results from going away underneath its user.
+ */
+static DEFINE_MUTEX(ext_devt_mutex);
+static DEFINE_IDR(ext_devt_idr);
+
 static struct device_type disk_type;
 
 /**
@@ -288,6 +298,74 @@ EXPORT_SYMBOL(unregister_blkdev);
 
 static struct kobj_map *bdev_map;
 
+/**
+ * blk_alloc_devt - allocate a dev_t for a partition
+ * @part: partition to allocate dev_t for
+ * @devt: out parameter for resulting dev_t
+ *
+ * Allocate a dev_t for @part: from the disk's own consecutive minor
+ * range if @part->partno fits there, otherwise from BLOCK_EXT_MAJOR.
+ *
+ * RETURNS:
+ * 0 on success, allocated dev_t is returned in *@devt.  -errno on
+ * failure (-EBUSY once the extended minor space is exhausted).
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
+{
+	struct gendisk *disk = part_to_disk(part);
+	int idx, rc;
+
+	/* in consecutive minor range? */
+	if (part->partno < disk->minors) {
+		*devt = MKDEV(disk->major, disk->first_minor + part->partno);
+		return 0;
+	}
+
+	/* allocate ext devt */
+	do {
+		if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL))
+			return -ENOMEM;
+		rc = idr_get_new(&ext_devt_idr, part, &idx);
+	} while (rc == -EAGAIN);
+
+	if (rc)
+		return rc;
+
+	if (idx >= MAX_EXT_DEVT) {
+		idr_remove(&ext_devt_idr, idx);
+		return -EBUSY;
+	}
+
+	*devt = MKDEV(BLOCK_EXT_MAJOR, idx);
+	return 0;
+}
+
+/**
+ * blk_free_devt - free a dev_t
+ * @devt: dev_t to free
+ *
+ * Free @devt which was allocated using blk_alloc_devt().
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void blk_free_devt(dev_t devt)
+{
+	might_sleep();
+
+	if (devt == MKDEV(0, 0))
+		return;
+
+	if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
+		mutex_lock(&ext_devt_mutex);
+		idr_remove(&ext_devt_idr, MINOR(devt));
+		mutex_unlock(&ext_devt_mutex);
+	}
+}
+
 /*
  * Register device numbers dev..(dev+range-1)
  * range must be nonzero
@@ -371,10 +449,27 @@ void unlink_gendisk(struct gendisk *disk)
  */
 struct gendisk *get_gendisk(dev_t devt, int *partno)
 {
-	struct kobject *kobj = kobj_lookup(bdev_map, devt, partno);
-	struct device *dev = kobj_to_dev(kobj);
+	struct gendisk *disk = NULL;
 
-	return  kobj ? dev_to_disk(dev) : NULL;
+	if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
+		struct kobject *kobj;
+
+		kobj = kobj_lookup(bdev_map, devt, partno);
+		if (kobj)
+			disk = dev_to_disk(kobj_to_dev(kobj));
+	} else {
+		struct hd_struct *part;
+
+		mutex_lock(&ext_devt_mutex);
+		part = idr_find(&ext_devt_idr, MINOR(devt));
+		if (part && get_disk(part_to_disk(part))) {
+			*partno = part->partno;
+			disk = part_to_disk(part);
+		}
+		mutex_unlock(&ext_devt_mutex);
+	}
+
+	return disk;
 }
 
 /**
@@ -877,18 +972,30 @@ struct gendisk *alloc_disk(int minors)
 }
 
 struct gendisk *alloc_disk_node(int minors, int node_id)
+{
+	return alloc_disk_ext_node(minors, 0, node_id);
+}
+
+struct gendisk *alloc_disk_ext(int minors, int ext_minors)
+{
+	return alloc_disk_ext_node(minors, ext_minors, -1);
+}
+
+struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 {
 	struct gendisk *disk;
 
 	disk = kmalloc_node(sizeof(struct gendisk),
 				GFP_KERNEL | __GFP_ZERO, node_id);
 	if (disk) {
+		int tot_minors = minors + ext_minors;
+
 		if (!init_disk_stats(disk)) {
 			kfree(disk);
 			return NULL;
 		}
-		if (minors > 1) {
-			int size = (minors - 1) * sizeof(struct hd_struct *);
+		if (tot_minors > 1) {
+			int size = (tot_minors - 1) * sizeof(struct hd_struct *);
 			disk->__part = kmalloc_node(size,
 				GFP_KERNEL | __GFP_ZERO, node_id);
 			if (!disk->__part) {
@@ -898,6 +1005,7 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 			}
 		}
 		disk->minors = minors;
+		disk->ext_minors = ext_minors;
 		rand_initialize_disk(disk);
 		disk->dev.class = &block_class;
 		disk->dev.type = &disk_type;
@@ -910,6 +1018,8 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
 
 EXPORT_SYMBOL(alloc_disk);
 EXPORT_SYMBOL(alloc_disk_node);
+EXPORT_SYMBOL(alloc_disk_ext);
+EXPORT_SYMBOL(alloc_disk_ext_node);
 
 struct kobject *get_disk(struct gendisk *disk)
 {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index c442f0aadac3..0d4b7f28f13f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -333,6 +333,7 @@ void delete_partition(struct gendisk *disk, int partno)
 	if (!part)
 		return;
 
+	blk_free_devt(part_devt(part));
 	rcu_assign_pointer(disk->__part[partno-1], NULL);
 	kobject_put(part->holder_dir);
 	device_del(&part->dev);
@@ -352,6 +353,7 @@ int add_partition(struct gendisk *disk, int partno,
 		  sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
+	dev_t devt = MKDEV(0, 0);
 	int err;
 
 	if (disk->__part[partno - 1])
@@ -378,11 +380,15 @@ int add_partition(struct gendisk *disk, int partno,
 			 "%s%d", disk->dev.bus_id, partno);
 
 	device_initialize(&p->dev);
-	p->dev.devt = MKDEV(disk->major, disk->first_minor + partno);
 	p->dev.class = &block_class;
 	p->dev.type = &part_type;
 	p->dev.parent = &disk->dev;
 
+	err = blk_alloc_devt(p, &devt);
+	if (err)
+		goto out_put;
+	p->dev.devt = devt;
+
 	/* delay uevent until 'holders' subdir is created */
 	p->dev.uevent_suppress = 1;
 	err = device_add(&p->dev);
@@ -419,6 +425,7 @@ out_del:
 	device_del(&p->dev);
 out_put:
 	put_device(&p->dev);
+	blk_free_devt(devt);
 	return err;
 }
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index ac8a901f2002..6fc532424062 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -113,13 +113,15 @@ struct hd_struct {
 #define GENHD_FL_FAIL				64
 
 struct gendisk {
-	/* major, first_minor and minors are input parameters only,
-	 * don't use directly.  Use disk_devt() and disk_max_parts().
+	/* major, first_minor, minors and ext_minors are input
+	 * parameters only, don't use directly.  Use disk_devt() and
+	 * disk_max_parts().
 	 */
 	int major;			/* major number of driver */
 	int first_minor;
 	int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
+	int ext_minors;			/* number of extended dynamic minors */
 
 	char disk_name[32];		/* name of major driver */
 
@@ -167,7 +169,7 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
 
 static inline int disk_max_parts(struct gendisk *disk)
 {
-	return disk->minors - 1;
+	return disk->minors + disk->ext_minors - 1;
 }
 
 static inline dev_t disk_devt(struct gendisk *disk)
@@ -554,6 +556,8 @@ struct unixware_disklabel {
 #define ADDPART_FLAG_RAID	1
 #define ADDPART_FLAG_WHOLEDISK	2
 
+extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
+extern void blk_free_devt(dev_t devt);
 extern dev_t blk_lookup_devt(const char *name, int partno);
 extern char *disk_name (struct gendisk *hd, int partno, char *buf);
 
@@ -564,6 +568,9 @@ extern void printk_all_partitions(void);
 
 extern struct gendisk *alloc_disk_node(int minors, int node_id);
 extern struct gendisk *alloc_disk(int minors);
+extern struct gendisk *alloc_disk_ext_node(int minors, int ext_minrs,
+					   int node_id);
+extern struct gendisk *alloc_disk_ext(int minors, int ext_minors);
 extern struct kobject *get_disk(struct gendisk *disk);
 extern void put_disk(struct gendisk *disk);
 extern void blk_register_region(dev_t devt, unsigned long range,
diff --git a/include/linux/major.h b/include/linux/major.h
index 53d5fafd85c3..88249452b935 100644
--- a/include/linux/major.h
+++ b/include/linux/major.h
@@ -170,4 +170,6 @@
 
 #define VIOTAPE_MAJOR		230
 
+#define BLOCK_EXT_MAJOR		259
+
 #endif

From 1f0142905d4812966831613847db38a66da29eb8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:47:23 +0900
Subject: [PATCH 039/132] block: adjust formatting for large minors and add
 ext_range sysfs attr

With extended minors and the soon-to-follow debug feature, large minor
numbers for block devices will be common.  This patch does the
following to make printouts pretty.

* Adapt print formats such that large minors don't break the
  formatting.

* For extended MAJ:MIN, %02x%02x for MAJ:MIN used in
  printk_all_partitions() doesn't cut it anymore.  Update it such that
  %03x:%05x is used if either MAJ or MIN doesn't fit in %02x.

* Implement ext_range sysfs attribute which shows total minors the
  device can use including both conventional minor space and the
  extended one.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c      | 45 ++++++++++++++++++++++++++++++++++-----------
 include/linux/fs.h |  1 +
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 7bbfed05cecb..ee4b13520e59 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -366,6 +366,18 @@ void blk_free_devt(dev_t devt)
 	}
 }
 
+static char *bdevt_str(dev_t devt, char *buf)
+{
+	if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
+		char tbuf[BDEVT_SIZE];
+		snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
+		snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
+	} else
+		snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
+
+	return buf;
+}
+
 /*
  * Register device numbers dev..(dev+range-1)
  * range must be nonzero
@@ -521,7 +533,8 @@ void __init printk_all_partitions(void)
 		struct gendisk *disk = dev_to_disk(dev);
 		struct disk_part_iter piter;
 		struct hd_struct *part;
-		char buf[BDEVNAME_SIZE];
+		char name_buf[BDEVNAME_SIZE];
+		char devt_buf[BDEVT_SIZE];
 
 		/*
 		 * Don't show empty devices or things that have been
@@ -536,10 +549,10 @@ void __init printk_all_partitions(void)
 		 * numbers in hex - the same format as the root=
 		 * option takes.
 		 */
-		printk("%02x%02x %10llu %s",
-		       MAJOR(disk_devt(disk)), MINOR(disk_devt(disk)),
+		printk("%s %10llu %s",
+		       bdevt_str(disk_devt(disk), devt_buf),
 		       (unsigned long long)get_capacity(disk) >> 1,
-		       disk_name(disk, 0, buf));
+		       disk_name(disk, 0, name_buf));
 		if (disk->driverfs_dev != NULL &&
 		    disk->driverfs_dev->driver != NULL)
 			printk(" driver: %s\n",
@@ -550,10 +563,10 @@ void __init printk_all_partitions(void)
 		/* now show the partitions */
 		disk_part_iter_init(&piter, disk, 0);
 		while ((part = disk_part_iter_next(&piter)))
-			printk("  %02x%02x %10llu %s\n",
-			       MAJOR(part_devt(part)), MINOR(part_devt(part)),
+			printk("  %s %10llu %s\n",
+			       bdevt_str(part_devt(part), devt_buf),
 			       (unsigned long long)part->nr_sects >> 1,
-			       disk_name(disk, part->partno, buf));
+			       disk_name(disk, part->partno, name_buf));
 		disk_part_iter_exit(&piter);
 	}
 	class_dev_iter_exit(&iter);
@@ -630,14 +643,14 @@ static int show_partition(struct seq_file *seqf, void *v)
 		return 0;
 
 	/* show the full disk and all non-0 size partitions of it */
-	seq_printf(seqf, "%4d  %4d %10llu %s\n",
+	seq_printf(seqf, "%4d  %7d %10llu %s\n",
 		MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)),
 		(unsigned long long)get_capacity(sgp) >> 1,
 		disk_name(sgp, 0, buf));
 
 	disk_part_iter_init(&piter, sgp, 0);
 	while ((part = disk_part_iter_next(&piter)))
-		seq_printf(seqf, "%4d  %4d %10llu %s\n",
+		seq_printf(seqf, "%4d  %7d %10llu %s\n",
 			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
 			   (unsigned long long)part->nr_sects >> 1,
 			   disk_name(sgp, part->partno, buf));
@@ -691,6 +704,14 @@ static ssize_t disk_range_show(struct device *dev,
 	return sprintf(buf, "%d\n", disk->minors);
 }
 
+static ssize_t disk_ext_range_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%d\n", disk_max_parts(disk) + 1);
+}
+
 static ssize_t disk_removable_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
@@ -780,6 +801,7 @@ static ssize_t disk_fail_store(struct device *dev,
 #endif
 
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
+static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, disk_size_show, NULL);
@@ -792,6 +814,7 @@ static struct device_attribute dev_attr_fail =
 
 static struct attribute *disk_attrs[] = {
 	&dev_attr_range.attr,
+	&dev_attr_ext_range.attr,
 	&dev_attr_removable.attr,
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
@@ -858,7 +881,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	cpu = disk_stat_lock();
 	disk_round_stats(cpu, gp);
 	disk_stat_unlock();
-	seq_printf(seqf, "%4d %4d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
+	seq_printf(seqf, "%4d %7d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
 		MAJOR(disk_devt(gp)), MINOR(disk_devt(gp)),
 		disk_name(gp, 0, buf),
 		disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
@@ -877,7 +900,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		cpu = disk_stat_lock();
 		part_round_stats(cpu, hd);
 		disk_stat_unlock();
-		seq_printf(seqf, "%4d %4d %s %lu %lu %llu "
+		seq_printf(seqf, "%4d %7d %s %lu %lu %llu "
 			   "%u %lu %lu %llu %u %u %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 860689f541b1..02a9fb5a830c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1685,6 +1685,7 @@ extern void chrdev_show(struct seq_file *,off_t);
 
 /* fs/block_dev.c */
 #define BDEVNAME_SIZE	32	/* Largest string for a blockdev identifier */
+#define BDEVT_SIZE	10	/* Largest string for MAJ:MIN for blkdev */
 
 #ifdef CONFIG_BLOCK
 #define BLKDEV_MAJOR_HASH_SIZE	255

From f615b48cc7df7cac3865ec76ac1a5bb04d3e07f4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:47:24 +0900
Subject: [PATCH 040/132] sd/ide-disk: apply extended minors to sd and ide

Update sd and ide-disk such that they can take advantage of extended
minors.

ide-disk already has 64 minors per device and currently doesn't use
extended minors although after this patch it can be turned on by
simply tweaking constants.

sd only had 16 minors per device causing problems on certain peculiar
configurations.  This patch lifts the restriction and enables it to
use up to 64 minors.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-disk.c | 11 ++++++++---
 drivers/scsi/sd.c      |  9 +++++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 07ef88bd109b..7a88de9ada29 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -41,6 +41,10 @@
 #include <asm/io.h>
 #include <asm/div64.h>
 
+#define IDE_DISK_PARTS		(1 << PARTN_BITS)
+#define IDE_DISK_MINORS		IDE_DISK_PARTS
+#define IDE_DISK_EXT_MINORS	(IDE_DISK_PARTS - IDE_DISK_MINORS)
+
 struct ide_disk_obj {
 	ide_drive_t	*drive;
 	ide_driver_t	*driver;
@@ -1151,8 +1155,8 @@ static int ide_disk_probe(ide_drive_t *drive)
 	if (!idkp)
 		goto failed;
 
-	g = alloc_disk_node(1 << PARTN_BITS,
-			hwif_to_node(drive->hwif));
+	g = alloc_disk_ext_node(IDE_DISK_MINORS, IDE_DISK_EXT_MINORS,
+				hwif_to_node(drive->hwif));
 	if (!g)
 		goto out_free_idkp;
 
@@ -1178,7 +1182,8 @@ static int ide_disk_probe(ide_drive_t *drive)
 	} else
 		drive->attach = 1;
 
-	g->minors = 1 << PARTN_BITS;
+	g->minors = IDE_DISK_MINORS;
+	g->ext_minors = IDE_DISK_EXT_MINORS;
 	g->driverfs_dev = &drive->gendev;
 	g->flags = drive->removable ? GENHD_FL_REMOVABLE : 0;
 	set_capacity(g, idedisk_capacity(drive));
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index e5e7d7856454..d1bb0e1d2d28 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -86,6 +86,10 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
 
+#define SD_PARTS	64
+#define SD_MINORS	16
+#define SD_EXT_MINORS	(SD_PARTS - SD_MINORS)
+
 static int  sd_revalidate_disk(struct gendisk *);
 static int  sd_probe(struct device *);
 static int  sd_remove(struct device *);
@@ -1801,7 +1805,7 @@ static int sd_probe(struct device *dev)
 	if (!sdkp)
 		goto out;
 
-	gd = alloc_disk(16);
+	gd = alloc_disk_ext(SD_MINORS, SD_EXT_MINORS);
 	if (!gd)
 		goto out_free;
 
@@ -1845,7 +1849,8 @@ static int sd_probe(struct device *dev)
 
 	gd->major = sd_major((index & 0xf0) >> 4);
 	gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
-	gd->minors = 16;
+	gd->minors = SD_MINORS;
+	gd->ext_minors = SD_EXT_MINORS;
 	gd->fops = &sd_fops;
 
 	if (index < 26) {

From 870d6656126add8e383645732b03df2b7ccd4f94 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:47:25 +0900
Subject: [PATCH 041/132] block: implement CONFIG_DEBUG_BLOCK_EXT_DEVT

Extended devt introduces non-contiguous device numbers.  This patch
implements a debug option which forces most devt allocations to be
from the extended area and spreads them out.  This is enabled by
default if DEBUG_KERNEL is set and achieves the following:

1. Detects code paths in kernel or userland which expect predetermined
   consecutive device numbers.

2. When something goes wrong, avoids corruption, as adding to the minor
   of an earlier partition won't lead to a wrong but valid device.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c          | 38 +++++++++++++++++++++++++++++++++++---
 drivers/ide/ide-disk.c |  6 ++++++
 drivers/scsi/sd.c      |  6 ++++++
 lib/Kconfig.debug      | 16 ++++++++++++++++
 4 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index ee4b13520e59..67e5a59ced2a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -298,6 +298,38 @@ EXPORT_SYMBOL(unregister_blkdev);
 
 static struct kobj_map *bdev_map;
 
+/**
+ * blk_mangle_minor - scatter minor numbers apart
+ * @minor: minor number to mangle
+ *
+ * Scatter consecutively allocated @minor number apart if
+ * CONFIG_DEBUG_BLOCK_EXT_DEVT is set.  Mangling twice is the identity.
+ *
+ * RETURNS:
+ * Mangled value, or @minor unchanged if the option is off.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+static int blk_mangle_minor(int minor)
+{
+#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
+	int i;
+
+	for (i = 0; i < MINORBITS / 2; i++) {
+		int low = minor & (1 << i);
+		int high = minor & (1 << (MINORBITS - 1 - i));
+		int distance = MINORBITS - 1 - 2 * i;
+
+		minor ^= low | high;	/* clear both bits */
+		low <<= distance;	/* swap the positions */
+		high >>= distance;
+		minor |= low | high;	/* and set */
+	}
+#endif
+	return minor;
+}
+
 /**
  * blk_alloc_devt - allocate a dev_t for a partition
  * @part: partition to allocate dev_t for
@@ -339,7 +371,7 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
 		return -EBUSY;
 	}
 
-	*devt = MKDEV(BLOCK_EXT_MAJOR, idx);
+	*devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
 	return 0;
 }
 
@@ -361,7 +393,7 @@ void blk_free_devt(dev_t devt)
 
 	if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
 		mutex_lock(&ext_devt_mutex);
-		idr_remove(&ext_devt_idr, MINOR(devt));
+		idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 		mutex_unlock(&ext_devt_mutex);
 	}
 }
@@ -473,7 +505,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
 		struct hd_struct *part;
 
 		mutex_lock(&ext_devt_mutex);
-		part = idr_find(&ext_devt_idr, MINOR(devt));
+		part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 		if (part && get_disk(part_to_disk(part))) {
 			*partno = part->partno;
 			disk = part_to_disk(part);
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 7a88de9ada29..a072df5053ae 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -42,7 +42,13 @@
 #include <asm/div64.h>
 
 #define IDE_DISK_PARTS		(1 << PARTN_BITS)
+
+#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
 #define IDE_DISK_MINORS		IDE_DISK_PARTS
+#else
+#define IDE_DISK_MINORS		1
+#endif
+
 #define IDE_DISK_EXT_MINORS	(IDE_DISK_PARTS - IDE_DISK_MINORS)
 
 struct ide_disk_obj {
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index d1bb0e1d2d28..280d231a86ed 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -87,7 +87,13 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
 
 #define SD_PARTS	64
+
+#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
 #define SD_MINORS	16
+#else
+#define SD_MINORS	1
+#endif
+
 #define SD_EXT_MINORS	(SD_PARTS - SD_MINORS)
 
 static int  sd_revalidate_disk(struct gendisk *);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0b504814e378..5a536f703a83 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -624,6 +624,22 @@ config BACKTRACE_SELF_TEST
 
 	  Say N if you are unsure.
 
+config DEBUG_BLOCK_EXT_DEVT
+        bool "Force extended block device numbers and spread them"
+	depends on DEBUG_KERNEL
+	depends on BLOCK
+	default y
+	help
+	  Conventionally, block device numbers are allocated from a
+	  predetermined contiguous area.  However, the extended block
+	  area may introduce non-contiguous block device numbers.  This
+	  option forces most block device numbers to be allocated from
+	  the extended space and spreads them to discover kernel or
+	  userland code paths which assume predetermined contiguous
+	  device number allocation.
+
+	  Say N if you are unsure.
+
 config LKDTM
 	tristate "Linux Kernel Dump Test Tool Module"
 	depends on DEBUG_KERNEL

From ed9e1982347b36573cd622ee5f4e2a7ccd79b3fd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:05 +0900
Subject: [PATCH 042/132] block: implement and use {disk|part}_to_dev()

Implement {disk|part}_to_dev() and use them to access the generic
device instead of directly dereferencing {disk|part}->dev.  To make
sure no user is left behind, rename the generic device fields to __dev.

This is in preparation for unifying partition 0 handling with other
partitions.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-integrity.c      |  5 ++-
 block/blk-sysfs.c          |  4 +-
 block/genhd.c              | 27 ++++++-------
 drivers/block/aoe/aoeblk.c |  4 +-
 drivers/block/nbd.c        |  4 +-
 drivers/ide/ide-probe.c    |  2 +-
 drivers/md/dm.c            |  4 +-
 drivers/md/md.c            | 10 ++---
 fs/block_dev.c             |  4 +-
 fs/partitions/check.c      | 79 ++++++++++++++++++++------------------
 include/linux/genhd.h      | 20 +++++-----
 11 files changed, 86 insertions(+), 77 deletions(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index d87606eaca1d..69023da63151 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -331,7 +331,8 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 			return -1;
 
 		if (kobject_init_and_add(&bi->kobj, &integrity_ktype,
-					 &disk->dev.kobj, "%s", "integrity")) {
+					 &disk_to_dev(disk)->kobj,
+					 "%s", "integrity")) {
 			kmem_cache_free(integrity_cachep, bi);
 			return -1;
 		}
@@ -375,7 +376,7 @@ void blk_integrity_unregister(struct gendisk *disk)
 
 	kobject_uevent(&bi->kobj, KOBJ_REMOVE);
 	kobject_del(&bi->kobj);
-	kobject_put(&disk->dev.kobj);
+	kobject_put(&disk_to_dev(disk)->kobj);
 	kmem_cache_free(integrity_cachep, bi);
 }
 EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 304ec73ab821..b9a6ed166649 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -310,7 +310,7 @@ int blk_register_queue(struct gendisk *disk)
 	if (!q->request_fn)
 		return 0;
 
-	ret = kobject_add(&q->kobj, kobject_get(&disk->dev.kobj),
+	ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
 			  "%s", "queue");
 	if (ret < 0)
 		return ret;
@@ -339,6 +339,6 @@ void blk_unregister_queue(struct gendisk *disk)
 
 		kobject_uevent(&q->kobj, KOBJ_REMOVE);
 		kobject_del(&q->kobj);
-		kobject_put(&disk->dev.kobj);
+		kobject_put(&disk_to_dev(disk)->kobj);
 	}
 }
diff --git a/block/genhd.c b/block/genhd.c
index 67e5a59ced2a..0a2f16bd54b7 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -59,7 +59,7 @@ struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
 	rcu_read_lock();
 	part = rcu_dereference(disk->__part[partno - 1]);
 	if (part)
-		get_device(&part->dev);
+		get_device(part_to_dev(part));
 	rcu_read_unlock();
 
 	return part;
@@ -130,7 +130,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
 		if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects)
 			continue;
 
-		get_device(&part->dev);
+		get_device(part_to_dev(part));
 		piter->part = part;
 		piter->idx += inc;
 		break;
@@ -435,7 +435,7 @@ static struct kobject *exact_match(dev_t devt, int *partno, void *data)
 {
 	struct gendisk *p = data;
 
-	return &p->dev.kobj;
+	return &disk_to_dev(p)->kobj;
 }
 
 static int exact_lock(dev_t devt, void *data)
@@ -460,7 +460,7 @@ void add_disk(struct gendisk *disk)
 	int retval;
 
 	disk->flags |= GENHD_FL_UP;
-	disk->dev.devt = MKDEV(disk->major, disk->first_minor);
+	disk_to_dev(disk)->devt = MKDEV(disk->major, disk->first_minor);
 	blk_register_region(disk_devt(disk), disk->minors, NULL,
 			    exact_match, exact_lock, disk);
 	register_disk(disk);
@@ -468,7 +468,8 @@ void add_disk(struct gendisk *disk)
 
 	bdi = &disk->queue->backing_dev_info;
 	bdi_register_dev(bdi, disk_devt(disk));
-	retval = sysfs_create_link(&disk->dev.kobj, &bdi->dev->kobj, "bdi");
+	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
+				   "bdi");
 	WARN_ON(retval);
 }
 
@@ -477,7 +478,7 @@ EXPORT_SYMBOL(del_gendisk);	/* in partitions/check.c */
 
 void unlink_gendisk(struct gendisk *disk)
 {
-	sysfs_remove_link(&disk->dev.kobj, "bdi");
+	sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
 	bdi_unregister(&disk->queue->backing_dev_info);
 	blk_unregister_queue(disk);
 	blk_unregister_region(disk_devt(disk), disk->minors);
@@ -903,7 +904,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	int cpu;
 
 	/*
-	if (&gp->dev.kobj.entry == block_class.devices.next)
+	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
 		seq_puts(seqf,	"major minor name"
 				"     rio rmerge rsect ruse wio wmerge "
 				"wsect wuse running use aveq"
@@ -972,7 +973,7 @@ static void media_change_notify_thread(struct work_struct *work)
 	 * set enviroment vars to indicate which event this is for
 	 * so that user space will know to go check the media status.
 	 */
-	kobject_uevent_env(&gd->dev.kobj, KOBJ_CHANGE, envp);
+	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
 	put_device(gd->driverfs_dev);
 }
 
@@ -1062,9 +1063,9 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 		disk->minors = minors;
 		disk->ext_minors = ext_minors;
 		rand_initialize_disk(disk);
-		disk->dev.class = &block_class;
-		disk->dev.type = &disk_type;
-		device_initialize(&disk->dev);
+		disk_to_dev(disk)->class = &block_class;
+		disk_to_dev(disk)->type = &disk_type;
+		device_initialize(disk_to_dev(disk));
 		INIT_WORK(&disk->async_notify,
 			media_change_notify_thread);
 	}
@@ -1086,7 +1087,7 @@ struct kobject *get_disk(struct gendisk *disk)
 	owner = disk->fops->owner;
 	if (owner && !try_module_get(owner))
 		return NULL;
-	kobj = kobject_get(&disk->dev.kobj);
+	kobj = kobject_get(&disk_to_dev(disk)->kobj);
 	if (kobj == NULL) {
 		module_put(owner);
 		return NULL;
@@ -1100,7 +1101,7 @@ EXPORT_SYMBOL(get_disk);
 void put_disk(struct gendisk *disk)
 {
 	if (disk)
-		kobject_put(&disk->dev.kobj);
+		kobject_put(&disk_to_dev(disk)->kobj);
 }
 
 EXPORT_SYMBOL(put_disk);
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 0c39782b2660..3edb6cb7d68f 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -109,12 +109,12 @@ static const struct attribute_group attr_group = {
 static int
 aoedisk_add_sysfs(struct aoedev *d)
 {
-	return sysfs_create_group(&d->gd->dev.kobj, &attr_group);
+	return sysfs_create_group(&disk_to_dev(d->gd)->kobj, &attr_group);
 }
 void
 aoedisk_rm_sysfs(struct aoedev *d)
 {
-	sysfs_remove_group(&d->gd->dev.kobj, &attr_group);
+	sysfs_remove_group(&disk_to_dev(d->gd)->kobj, &attr_group);
 }
 
 static int
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 1778e4a2c672..7b3351260d56 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -403,7 +403,7 @@ static int nbd_do_it(struct nbd_device *lo)
 	BUG_ON(lo->magic != LO_MAGIC);
 
 	lo->pid = current->pid;
-	ret = sysfs_create_file(&lo->disk->dev.kobj, &pid_attr.attr);
+	ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
 	if (ret) {
 		printk(KERN_ERR "nbd: sysfs_create_file failed!");
 		return ret;
@@ -412,7 +412,7 @@ static int nbd_do_it(struct nbd_device *lo)
 	while ((req = nbd_read_stat(lo)) != NULL)
 		nbd_end_request(req);
 
-	sysfs_remove_file(&lo->disk->dev.kobj, &pid_attr.attr);
+	sysfs_remove_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr);
 	return 0;
 }
 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index a51a30e9eab3..70aa86c8807e 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -1188,7 +1188,7 @@ static struct kobject *exact_match(dev_t dev, int *part, void *data)
 {
 	struct gendisk *p = data;
 	*part &= (1 << PARTN_BITS) - 1;
-	return &p->dev.kobj;
+	return &disk_to_dev(p)->kobj;
 }
 
 static int exact_lock(dev_t dev, void *data)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 653624792eaf..637806695bb9 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1186,7 +1186,7 @@ static void event_callback(void *context)
 	list_splice_init(&md->uevent_list, &uevents);
 	spin_unlock_irqrestore(&md->uevent_lock, flags);
 
-	dm_send_uevents(&uevents, &md->disk->dev.kobj);
+	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
 
 	atomic_inc(&md->event_nr);
 	wake_up(&md->eventq);
@@ -1643,7 +1643,7 @@ out:
  *---------------------------------------------------------------*/
 void dm_kobject_uevent(struct mapped_device *md)
 {
-	kobject_uevent(&md->disk->dev.kobj, KOBJ_CHANGE);
+	kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
 }
 
 uint32_t dm_next_uevent_seq(struct mapped_device *md)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index deeac4b44173..96e9fccd2eab 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1465,9 +1465,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 		goto fail;
 
 	if (rdev->bdev->bd_part)
-		ko = &rdev->bdev->bd_part->dev.kobj;
+		ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
 	else
-		ko = &rdev->bdev->bd_disk->dev.kobj;
+		ko = &disk_to_dev(rdev->bdev->bd_disk)->kobj;
 	if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
 		kobject_del(&rdev->kobj);
 		goto fail;
@@ -3470,8 +3470,8 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
 	disk->queue = mddev->queue;
 	add_disk(disk);
 	mddev->gendisk = disk;
-	error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
-				     "%s", "md");
+	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
+				     &disk_to_dev(disk)->kobj, "%s", "md");
 	mutex_unlock(&disks_mutex);
 	if (error)
 		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
@@ -3761,7 +3761,7 @@ static int do_md_run(mddev_t * mddev)
 	sysfs_notify(&mddev->kobj, NULL, "array_state");
 	sysfs_notify(&mddev->kobj, NULL, "sync_action");
 	sysfs_notify(&mddev->kobj, NULL, "degraded");
-	kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
+	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
 	return 0;
 }
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2f2873b9a041..a02df22f37c3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -543,9 +543,9 @@ EXPORT_SYMBOL(bd_release);
 static struct kobject *bdev_get_kobj(struct block_device *bdev)
 {
 	if (bdev->bd_contains != bdev)
-		return kobject_get(&bdev->bd_part->dev.kobj);
+		return kobject_get(&part_to_dev(bdev->bd_part)->kobj);
 	else
-		return kobject_get(&bdev->bd_disk->dev.kobj);
+		return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj);
 }
 
 static struct kobject *bdev_get_holder(struct block_device *bdev)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0d4b7f28f13f..ac0df3acdcda 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -309,7 +309,7 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
 {
 	struct kobject *k;
 
-	k = kobject_get(&disk->dev.kobj);
+	k = kobject_get(&disk_to_dev(disk)->kobj);
 	disk->holder_dir = kobject_create_and_add("holders", k);
 	disk->slave_dir = kobject_create_and_add("slaves", k);
 	kobject_put(k);
@@ -322,7 +322,7 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
 	part->start_sect = 0;
 	part->nr_sects = 0;
 	part_stat_set_all(part, 0);
-	put_device(&part->dev);
+	put_device(part_to_dev(part));
 }
 
 void delete_partition(struct gendisk *disk, int partno)
@@ -336,7 +336,7 @@ void delete_partition(struct gendisk *disk, int partno)
 	blk_free_devt(part_devt(part));
 	rcu_assign_pointer(disk->__part[partno-1], NULL);
 	kobject_put(part->holder_dir);
-	device_del(&part->dev);
+	device_del(part_to_dev(part));
 
 	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
 }
@@ -354,6 +354,9 @@ int add_partition(struct gendisk *disk, int partno,
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
+	struct device *ddev = disk_to_dev(disk);
+	struct device *pdev;
+	const char *dname;
 	int err;
 
 	if (disk->__part[partno - 1])
@@ -367,42 +370,43 @@ int add_partition(struct gendisk *disk, int partno,
 		err = -ENOMEM;
 		goto out_free;
 	}
+	pdev = part_to_dev(p);
+
 	p->start_sect = start;
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = disk->policy;
 
-	if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
-		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-		"%sp%d", disk->dev.bus_id, partno);
+	dname = dev_name(ddev);
+	if (isdigit(dname[strlen(dname) - 1]))
+		snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
 	else
-		snprintf(p->dev.bus_id, BUS_ID_SIZE,
-			 "%s%d", disk->dev.bus_id, partno);
+		snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
 
-	device_initialize(&p->dev);
-	p->dev.class = &block_class;
-	p->dev.type = &part_type;
-	p->dev.parent = &disk->dev;
+	device_initialize(pdev);
+	pdev->class = &block_class;
+	pdev->type = &part_type;
+	pdev->parent = ddev;
 
 	err = blk_alloc_devt(p, &devt);
 	if (err)
-		goto out_put;
-	p->dev.devt = devt;
+		goto out_free;
+	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
-	p->dev.uevent_suppress = 1;
-	err = device_add(&p->dev);
+	pdev->uevent_suppress = 1;
+	err = device_add(pdev);
 	if (err)
 		goto out_put;
 
 	err = -ENOMEM;
-	p->holder_dir = kobject_create_and_add("holders", &p->dev.kobj);
+	p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
 	if (!p->holder_dir)
 		goto out_del;
 
-	p->dev.uevent_suppress = 0;
+	pdev->uevent_suppress = 0;
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
-		err = device_create_file(&p->dev, &dev_attr_whole_disk);
+		err = device_create_file(pdev, &dev_attr_whole_disk);
 		if (err)
 			goto out_del;
 	}
@@ -412,8 +416,8 @@ int add_partition(struct gendisk *disk, int partno,
 	rcu_assign_pointer(disk->__part[partno - 1], p);
 
 	/* suppress uevent if the disk supresses it */
-	if (!disk->dev.uevent_suppress)
-		kobject_uevent(&p->dev.kobj, KOBJ_ADD);
+	if (!ddev->uevent_suppress)
+		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
 	return 0;
 
@@ -422,9 +426,9 @@ out_free:
 	return err;
 out_del:
 	kobject_put(p->holder_dir);
-	device_del(&p->dev);
+	device_del(pdev);
 out_put:
-	put_device(&p->dev);
+	put_device(pdev);
 	blk_free_devt(devt);
 	return err;
 }
@@ -432,30 +436,31 @@ out_put:
 /* Not exported, helper to add_disk(). */
 void register_disk(struct gendisk *disk)
 {
+	struct device *ddev = disk_to_dev(disk);
 	struct block_device *bdev;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 	char *s;
 	int err;
 
-	disk->dev.parent = disk->driverfs_dev;
+	ddev->parent = disk->driverfs_dev;
 
-	strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE);
+	strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
 	/* ewww... some of these buggers have / in the name... */
-	s = strchr(disk->dev.bus_id, '/');
+	s = strchr(ddev->bus_id, '/');
 	if (s)
 		*s = '!';
 
 	/* delay uevents, until we scanned partition table */
-	disk->dev.uevent_suppress = 1;
+	ddev->uevent_suppress = 1;
 
-	if (device_add(&disk->dev))
+	if (device_add(ddev))
 		return;
 #ifndef CONFIG_SYSFS_DEPRECATED
-	err = sysfs_create_link(block_depr, &disk->dev.kobj,
-				kobject_name(&disk->dev.kobj));
+	err = sysfs_create_link(block_depr, &ddev->kobj,
+				kobject_name(&ddev->kobj));
 	if (err) {
-		device_del(&disk->dev);
+		device_del(ddev);
 		return;
 	}
 #endif
@@ -481,13 +486,13 @@ void register_disk(struct gendisk *disk)
 
 exit:
 	/* announce disk after possible partitions are created */
-	disk->dev.uevent_suppress = 0;
-	kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
+	ddev->uevent_suppress = 0;
+	kobject_uevent(&ddev->kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
 	disk_part_iter_init(&piter, disk, 0);
 	while ((part = disk_part_iter_next(&piter)))
-		kobject_uevent(&part->dev.kobj, KOBJ_ADD);
+		kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
 	disk_part_iter_exit(&piter);
 }
 
@@ -518,7 +523,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		return -EIO;
 
 	/* tell userspace that the media / partition table may have changed */
-	kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
+	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
 
 	for (p = 1; p < state->limit; p++) {
 		sector_t size = state->parts[p].size;
@@ -591,7 +596,7 @@ void del_gendisk(struct gendisk *disk)
 	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
 #ifndef CONFIG_SYSFS_DEPRECATED
-	sysfs_remove_link(block_depr, disk->dev.bus_id);
+	sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
 #endif
-	device_del(&disk->dev);
+	device_del(disk_to_dev(disk));
 }
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 6fc532424062..e4e18c509ac5 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -15,9 +15,11 @@
 
 #ifdef CONFIG_BLOCK
 
-#define kobj_to_dev(k) container_of(k, struct device, kobj)
-#define dev_to_disk(device) container_of(device, struct gendisk, dev)
-#define dev_to_part(device) container_of(device, struct hd_struct, dev)
+#define kobj_to_dev(k)		container_of((k), struct device, kobj)
+#define dev_to_disk(device)	container_of((device), struct gendisk, __dev)
+#define dev_to_part(device)	container_of((device), struct hd_struct, __dev)
+#define disk_to_dev(disk)	(&((disk)->__dev))
+#define part_to_dev(part)	(&((part)->__dev))
 
 extern struct device_type part_type;
 extern struct kobject *block_depr;
@@ -88,7 +90,7 @@ struct disk_stats {
 struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
-	struct device dev;
+	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -139,7 +141,7 @@ struct gendisk {
 
 	int flags;
 	struct device *driverfs_dev;  // FIXME: remove
-	struct device dev;
+	struct device __dev;
 	struct kobject *holder_dir;
 	struct kobject *slave_dir;
 
@@ -163,7 +165,7 @@ struct gendisk {
 static inline struct gendisk *part_to_disk(struct hd_struct *part)
 {
 	if (likely(part))
-		return dev_to_disk((part)->dev.parent);
+		return dev_to_disk(part_to_dev(part)->parent);
 	return NULL;
 }
 
@@ -174,12 +176,12 @@ static inline int disk_max_parts(struct gendisk *disk)
 
 static inline dev_t disk_devt(struct gendisk *disk)
 {
-	return disk->dev.devt;
+	return disk_to_dev(disk)->devt;
 }
 
 static inline dev_t part_devt(struct hd_struct *part)
 {
-	return part->dev.devt;
+	return part_to_dev(part)->devt;
 }
 
 extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);
@@ -187,7 +189,7 @@ extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);
 static inline void disk_put_part(struct hd_struct *part)
 {
 	if (likely(part))
-		put_device(&part->dev);
+		put_device(part_to_dev(part));
 }
 
 /*

From b5d0b9df0ba5d9a044f3a21e7544f53d90bd1465 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 3 Sep 2008 09:06:42 +0200
Subject: [PATCH 043/132] block: introduce partition 0

genhd and partition code handled disk and partitions separately.  All
information about the whole disk was in struct gendisk and partitions in
struct hd_struct.  However, the whole disk (part0) and other
partitions have a lot in common and the data structures end up having
a good number of common fields and thus separate code paths doing the
same thing.  Also, the partition array was indexed by partno - 1 which
gets pretty confusing at times.

This patch introduces partition 0 and makes the partition array
indexed by partno.  Following patches will unify the handling of disk
and parts piece-by-piece.

This patch also implements disk_partitionable() which tests whether a
disk is partitionable.  With the coming dynamic partition array change,
the most common usage of disk_max_parts() will be testing whether a
disk is partitionable and the maximum number of partitions will become
much less important.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 40 +++++++++++++++++++++++-----------------
 block/ioctl.c         |  4 ++--
 fs/block_dev.c        |  2 +-
 fs/partitions/check.c | 12 ++++++------
 include/linux/genhd.h | 11 +++++++++--
 5 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 0a2f16bd54b7..65b7386c26d8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -54,10 +54,10 @@ struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
 {
 	struct hd_struct *part;
 
-	if (unlikely(partno < 1 || partno > disk_max_parts(disk)))
+	if (unlikely(partno < 0 || partno >= disk_max_parts(disk)))
 		return NULL;
 	rcu_read_lock();
-	part = rcu_dereference(disk->__part[partno - 1]);
+	part = rcu_dereference(disk->__part[partno]);
 	if (part)
 		get_device(part_to_dev(part));
 	rcu_read_unlock();
@@ -85,8 +85,10 @@ void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
 
 	if (flags & DISK_PITER_REVERSE)
 		piter->idx = disk_max_parts(piter->disk) - 1;
-	else
+	else if (flags & DISK_PITER_INCL_PART0)
 		piter->idx = 0;
+	else
+		piter->idx = 1;
 
 	piter->flags = flags;
 }
@@ -114,7 +116,10 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
 	/* determine iteration parameters */
 	if (piter->flags & DISK_PITER_REVERSE) {
 		inc = -1;
-		end = -1;
+		if (piter->flags & DISK_PITER_INCL_PART0)
+			end = -1;
+		else
+			end = 0;
 	} else {
 		inc = 1;
 		end = disk_max_parts(piter->disk);
@@ -177,7 +182,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 {
 	int i;
 
-	for (i = 0; i < disk_max_parts(disk); i++) {
+	for (i = 1; i < disk_max_parts(disk); i++) {
 		struct hd_struct *part = rcu_dereference(disk->__part[i]);
 
 		if (part && part->start_sect <= sector &&
@@ -669,7 +674,7 @@ static int show_partition(struct seq_file *seqf, void *v)
 	char buf[BDEVNAME_SIZE];
 
 	/* Don't show non-partitionable removeable devices or empty devices */
-	if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
+	if (!get_capacity(sgp) || (!disk_partitionable(sgp) &&
 				   (sgp->flags & GENHD_FL_REMOVABLE)))
 		return 0;
 	if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
@@ -742,7 +747,7 @@ static ssize_t disk_ext_range_show(struct device *dev,
 {
 	struct gendisk *disk = dev_to_disk(dev);
 
-	return sprintf(buf, "%d\n", disk_max_parts(disk) + 1);
+	return sprintf(buf, "%d\n", disk_max_parts(disk));
 }
 
 static ssize_t disk_removable_show(struct device *dev,
@@ -998,7 +1003,7 @@ dev_t blk_lookup_devt(const char *name, int partno)
 
 		if (strcmp(dev->bus_id, name))
 			continue;
-		if (partno < 0 || partno > disk_max_parts(disk))
+		if (partno < 0 || partno >= disk_max_parts(disk))
 			continue;
 
 		if (partno == 0)
@@ -1045,21 +1050,22 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 				GFP_KERNEL | __GFP_ZERO, node_id);
 	if (disk) {
 		int tot_minors = minors + ext_minors;
+		int size = tot_minors * sizeof(struct hd_struct *);
 
 		if (!init_disk_stats(disk)) {
 			kfree(disk);
 			return NULL;
 		}
-		if (tot_minors > 1) {
-			int size = (tot_minors - 1) * sizeof(struct hd_struct *);
-			disk->__part = kmalloc_node(size,
-				GFP_KERNEL | __GFP_ZERO, node_id);
-			if (!disk->__part) {
-				free_disk_stats(disk);
-				kfree(disk);
-				return NULL;
-			}
+
+		disk->__part = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO,
+					    node_id);
+		if (!disk->__part) {
+			free_disk_stats(disk);
+			kfree(disk);
+			return NULL;
 		}
+		disk->__part[0] = &disk->part0;
+
 		disk->minors = minors;
 		disk->ext_minors = ext_minors;
 		rand_initialize_disk(disk);
diff --git a/block/ioctl.c b/block/ioctl.c
index a5f672ad55f6..64e7c67a64b0 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -30,7 +30,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	if (bdev != bdev->bd_contains)
 		return -EINVAL;
 	partno = p.pno;
-	if (partno <= 0 || partno > disk_max_parts(disk))
+	if (partno <= 0 || partno >= disk_max_parts(disk))
 		return -EINVAL;
 	switch (a.op) {
 		case BLKPG_ADD_PARTITION:
@@ -102,7 +102,7 @@ static int blkdev_reread_part(struct block_device *bdev)
 	struct gendisk *disk = bdev->bd_disk;
 	int res;
 
-	if (!disk_max_parts(disk) || bdev != bdev->bd_contains)
+	if (!disk_partitionable(disk) || bdev != bdev->bd_contains)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a02df22f37c3..c982a9107979 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -892,7 +892,7 @@ int check_disk_change(struct block_device *bdev)
 
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
-	if (disk_max_parts(bdev->bd_disk))
+	if (disk_partitionable(bdev->bd_disk))
 		bdev->bd_invalidated = 1;
 	return 1;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ac0df3acdcda..b60699c271ac 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -173,7 +173,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
 
-	state->limit = disk_max_parts(hd) + 1;
+	state->limit = disk_max_parts(hd);
 	i = res = err = 0;
 	while (!res && check_part[i]) {
 		memset(&state->parts, 0, sizeof(state->parts));
@@ -329,12 +329,12 @@ void delete_partition(struct gendisk *disk, int partno)
 {
 	struct hd_struct *part;
 
-	part = disk->__part[partno-1];
+	part = disk->__part[partno];
 	if (!part)
 		return;
 
 	blk_free_devt(part_devt(part));
-	rcu_assign_pointer(disk->__part[partno-1], NULL);
+	rcu_assign_pointer(disk->__part[partno], NULL);
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
@@ -359,7 +359,7 @@ int add_partition(struct gendisk *disk, int partno,
 	const char *dname;
 	int err;
 
-	if (disk->__part[partno - 1])
+	if (disk->__part[partno])
 		return -EBUSY;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -413,7 +413,7 @@ int add_partition(struct gendisk *disk, int partno,
 
 	/* everything is up and running, commence */
 	INIT_RCU_HEAD(&p->rcu_head);
-	rcu_assign_pointer(disk->__part[partno - 1], p);
+	rcu_assign_pointer(disk->__part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
 	if (!ddev->uevent_suppress)
@@ -467,7 +467,7 @@ void register_disk(struct gendisk *disk)
 	disk_sysfs_add_subdirs(disk);
 
 	/* No minors to use for partitions */
-	if (!disk_max_parts(disk))
+	if (!disk_partitionable(disk))
 		goto exit;
 
 	/* No such device (e.g., media were just removed) */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e4e18c509ac5..9e866a2aee50 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -127,12 +127,13 @@ struct gendisk {
 
 	char disk_name[32];		/* name of major driver */
 
-	/* Array of pointers to partitions indexed by partno - 1.
+	/* Array of pointers to partitions indexed by partno.
 	 * Protected with matching bdev lock but stat and other
 	 * non-critical accesses use RCU.  Always access through
 	 * helpers.
 	 */
 	struct hd_struct **__part;
+	struct hd_struct part0;
 
 	struct block_device_operations *fops;
 	struct request_queue *queue;
@@ -171,7 +172,12 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
 
 static inline int disk_max_parts(struct gendisk *disk)
 {
-	return disk->minors + disk->ext_minors - 1;
+	return disk->minors + disk->ext_minors;
+}
+
+static inline bool disk_partitionable(struct gendisk *disk)
+{
+	return disk_max_parts(disk) > 1;
 }
 
 static inline dev_t disk_devt(struct gendisk *disk)
@@ -197,6 +203,7 @@ static inline void disk_put_part(struct hd_struct *part)
  */
 #define DISK_PITER_REVERSE	(1 << 0) /* iterate in the reverse direction */
 #define DISK_PITER_INCL_EMPTY	(1 << 1) /* include 0-sized parts */
+#define DISK_PITER_INCL_PART0	(1 << 2) /* include partition 0 */
 
 struct disk_part_iter {
 	struct gendisk		*disk;

From 80795aefb76d10c5d698e60c7e7750b5330787da Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:07 +0900
Subject: [PATCH 044/132] block: move capacity from disk to part0

Move disk->capacity to part0->nr_sects and convert all users who
directly accessed the field to use {get|set}_capacity().  This is done
early to allow the __dev field to be moved.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/aoe/aoeblk.c | 2 +-
 drivers/block/aoe/aoecmd.c | 4 ++--
 drivers/block/aoe/aoedev.c | 2 +-
 fs/partitions/check.c      | 2 +-
 include/linux/genhd.h      | 5 ++---
 5 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 3edb6cb7d68f..aa69556c3485 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -276,7 +276,7 @@ aoeblk_gdalloc(void *vp)
 	gd->first_minor = d->sysminor * AOE_PARTITIONS;
 	gd->fops = &aoe_bdops;
 	gd->private_data = d;
-	gd->capacity = d->ssize;
+	set_capacity(gd, d->ssize);
 	snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
 		d->aoemajor, d->aoeminor);
 
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 17eed8c025d0..934800f979c9 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -645,7 +645,7 @@ aoecmd_sleepwork(struct work_struct *work)
 		unsigned long flags;
 		u64 ssize;
 
-		ssize = d->gd->capacity;
+		ssize = get_capacity(d->gd);
 		bd = bdget_disk(d->gd, 0);
 
 		if (bd) {
@@ -707,7 +707,7 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
 	if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
 		return;
 	if (d->gd != NULL) {
-		d->gd->capacity = ssize;
+		set_capacity(d->gd, ssize);
 		d->flags |= DEVFL_NEWSIZE;
 	} else
 		d->flags |= DEVFL_GDALLOC;
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index a1d813ab0d6b..6a8038d115b5 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -91,7 +91,7 @@ aoedev_downdev(struct aoedev *d)
 	}
 
 	if (d->gd)
-		d->gd->capacity = 0;
+		set_capacity(d->gd, 0);
 
 	d->flags &= ~DEVFL_UP;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index b60699c271ac..902b95f1f9d5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -586,7 +586,7 @@ void del_gendisk(struct gendisk *disk)
 	disk_part_iter_exit(&piter);
 
 	invalidate_partition(disk, 0);
-	disk->capacity = 0;
+	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 	unlink_gendisk(disk);
 	disk_stat_set_all(disk, 0);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 9e866a2aee50..1cf828148ec6 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -138,7 +138,6 @@ struct gendisk {
 	struct block_device_operations *fops;
 	struct request_queue *queue;
 	void *private_data;
-	sector_t capacity;
 
 	int flags;
 	struct device *driverfs_dev;  // FIXME: remove
@@ -411,11 +410,11 @@ static inline sector_t get_start_sect(struct block_device *bdev)
 }
 static inline sector_t get_capacity(struct gendisk *disk)
 {
-	return disk->capacity;
+	return disk->part0.nr_sects;
 }
 static inline void set_capacity(struct gendisk *disk, sector_t size)
 {
-	disk->capacity = size;
+	disk->part0.nr_sects = size;
 }
 
 #ifdef CONFIG_SOLARIS_X86_PARTITION

From 548b10eb2959c96cef6fc29fc96e0931eeb53bc5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Aug 2008 09:01:47 +0200
Subject: [PATCH 045/132] block: move __dev from disk to part0

Move disk->__dev to part0->__dev.  This simplifies bdget_disk() and
lookup_devt() and allows common sysfs attributes to be unified.
part_to_disk() is updated to handle part0 -> disk.

Updated to include a fix from Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>,
he writes:

"part0 is a "special" partition and doesn't need to have capacity set - this
fixes regression caused by "block: move __dev from disk to part0" commit."

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 40 ++++++++++++----------------------------
 include/linux/genhd.h | 13 ++++++++-----
 2 files changed, 20 insertions(+), 33 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 65b7386c26d8..36b9f1bdd91f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -537,22 +537,15 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
  */
 extern struct block_device *bdget_disk(struct gendisk *disk, int partno)
 {
-	dev_t devt = MKDEV(0, 0);
+	struct hd_struct *part;
+	struct block_device *bdev = NULL;
 
-	if (partno == 0)
-		devt = disk_devt(disk);
-	else {
-		struct hd_struct *part;
+	part = disk_get_part(disk, partno);
+	if (part && (part->nr_sects || partno == 0))
+		bdev = bdget(part_devt(part));
+	disk_put_part(part);
 
-		part = disk_get_part(disk, partno);
-		if (part && part->nr_sects)
-			devt = part_devt(part);
-		disk_put_part(part);
-	}
-
-	if (likely(devt != MKDEV(0, 0)))
-		return bdget(devt);
-	return NULL;
+	return bdev;
 }
 EXPORT_SYMBOL(bdget_disk);
 
@@ -1000,27 +993,18 @@ dev_t blk_lookup_devt(const char *name, int partno)
 	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
 	while ((dev = class_dev_iter_next(&iter))) {
 		struct gendisk *disk = dev_to_disk(dev);
+		struct hd_struct *part;
 
 		if (strcmp(dev->bus_id, name))
 			continue;
-		if (partno < 0 || partno >= disk_max_parts(disk))
-			continue;
-
-		if (partno == 0)
-			devt = disk_devt(disk);
-		else {
-			struct hd_struct *part;
-
-			part = disk_get_part(disk, partno);
-			if (!part || !part->nr_sects) {
-				disk_put_part(part);
-				continue;
-			}
 
+		part = disk_get_part(disk, partno);
+		if (part && (part->nr_sects || partno == 0)) {
 			devt = part_devt(part);
 			disk_put_part(part);
+			break;
 		}
-		break;
+		disk_put_part(part);
 	}
 	class_dev_iter_exit(&iter);
 	return devt;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 1cf828148ec6..ff293ec8b3f7 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -16,9 +16,9 @@
 #ifdef CONFIG_BLOCK
 
 #define kobj_to_dev(k)		container_of((k), struct device, kobj)
-#define dev_to_disk(device)	container_of((device), struct gendisk, __dev)
+#define dev_to_disk(device)	container_of((device), struct gendisk, part0.__dev)
 #define dev_to_part(device)	container_of((device), struct hd_struct, __dev)
-#define disk_to_dev(disk)	(&((disk)->__dev))
+#define disk_to_dev(disk)	(&(disk)->part0.__dev)
 #define part_to_dev(part)	(&((part)->__dev))
 
 extern struct device_type part_type;
@@ -141,7 +141,6 @@ struct gendisk {
 
 	int flags;
 	struct device *driverfs_dev;  // FIXME: remove
-	struct device __dev;
 	struct kobject *holder_dir;
 	struct kobject *slave_dir;
 
@@ -164,8 +163,12 @@ struct gendisk {
 
 static inline struct gendisk *part_to_disk(struct hd_struct *part)
 {
-	if (likely(part))
-		return dev_to_disk(part_to_dev(part)->parent);
+	if (likely(part)) {
+		if (part->partno)
+			return dev_to_disk(part_to_dev(part)->parent);
+		else
+			return dev_to_disk(part_to_dev(part));
+	}
 	return NULL;
 }
 

From e56105214943ce5f0901d20e972a7cfd0d1d0656 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:09 +0900
Subject: [PATCH 046/132] block: unify sysfs size node handling

Now that capacity and __dev are moved to part0, part0 and others can
share the same method.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 10 +---------
 fs/partitions/check.c |  4 ++--
 include/linux/genhd.h |  3 +++
 3 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 36b9f1bdd91f..c70db35076a0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -760,14 +760,6 @@ static ssize_t disk_ro_show(struct device *dev,
 	return sprintf(buf, "%d\n", disk->policy ? 1 : 0);
 }
 
-static ssize_t disk_size_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
-{
-	struct gendisk *disk = dev_to_disk(dev);
-
-	return sprintf(buf, "%llu\n", (unsigned long long)get_capacity(disk));
-}
-
 static ssize_t disk_capability_show(struct device *dev,
 				    struct device_attribute *attr, char *buf)
 {
@@ -835,7 +827,7 @@ static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
-static DEVICE_ATTR(size, S_IRUGO, disk_size_show, NULL);
+static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 902b95f1f9d5..24d2c56d7d2d 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -208,8 +208,8 @@ static ssize_t part_start_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
 }
 
-static ssize_t part_size_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_size_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index ff293ec8b3f7..9cb8380cf0eb 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -591,6 +591,9 @@ extern void blk_register_region(dev_t devt, unsigned long range,
 			void *data);
 extern void blk_unregister_region(dev_t devt, unsigned long range);
 
+extern ssize_t part_size_show(struct device *dev,
+			      struct device_attribute *attr, char *buf);
+
 #else /* CONFIG_BLOCK */
 
 static inline void printk_all_partitions(void) { }

From b7db9956e57c8151b930d5e5fe5c766e6aad3ff7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:10 +0900
Subject: [PATCH 047/132] block: move policy from disk to part0

Move disk->policy to part0->policy.  Implement and use get_disk_ro().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 16 +++++-----------
 drivers/ide/ide-cd.c  |  2 +-
 drivers/md/dm-ioctl.c |  2 +-
 fs/partitions/check.c |  2 +-
 include/linux/genhd.h |  6 +++++-
 5 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index c70db35076a0..70358f3c7423 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -757,7 +757,7 @@ static ssize_t disk_ro_show(struct device *dev,
 {
 	struct gendisk *disk = dev_to_disk(dev);
 
-	return sprintf(buf, "%d\n", disk->policy ? 1 : 0);
+	return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
 }
 
 static ssize_t disk_capability_show(struct device *dev,
@@ -1090,10 +1090,7 @@ EXPORT_SYMBOL(put_disk);
 
 void set_device_ro(struct block_device *bdev, int flag)
 {
-	if (bdev->bd_contains != bdev)
-		bdev->bd_part->policy = flag;
-	else
-		bdev->bd_disk->policy = flag;
+	bdev->bd_part->policy = flag;
 }
 
 EXPORT_SYMBOL(set_device_ro);
@@ -1103,8 +1100,8 @@ void set_disk_ro(struct gendisk *disk, int flag)
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 
-	disk->policy = flag;
-	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+	disk_part_iter_init(&piter, disk,
+			    DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0);
 	while ((part = disk_part_iter_next(&piter)))
 		part->policy = flag;
 	disk_part_iter_exit(&piter);
@@ -1116,10 +1113,7 @@ int bdev_read_only(struct block_device *bdev)
 {
 	if (!bdev)
 		return 0;
-	else if (bdev->bd_contains != bdev)
-		return bdev->bd_part->policy;
-	else
-		return bdev->bd_disk->policy;
+	return bdev->bd_part->policy;
 }
 
 EXPORT_SYMBOL(bdev_read_only);
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index f16bb4667238..03c2cb6a58bc 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -1113,7 +1113,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 
 	if (write) {
 		/* disk has become write protected */
-		if (cd->disk->policy) {
+		if (get_disk_ro(cd->disk)) {
 			cdrom_end_request(drive, 0);
 			return ide_stopped;
 		}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c3de311117a1..5b919159f084 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -548,7 +548,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
 	 */
 	param->open_count = dm_open_count(md);
 
-	if (disk->policy)
+	if (get_disk_ro(disk))
 		param->flags |= DM_READONLY_FLAG;
 
 	param->event_nr = dm_get_event_nr(md);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 24d2c56d7d2d..ace6d03602c7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -375,7 +375,7 @@ int add_partition(struct gendisk *disk, int partno,
 	p->start_sect = start;
 	p->nr_sects = len;
 	p->partno = partno;
-	p->policy = disk->policy;
+	p->policy = get_disk_ro(disk);
 
 	dname = dev_name(ddev);
 	if (isdigit(dname[strlen(dname) - 1]))
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 9cb8380cf0eb..4411bdd671dd 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -145,7 +145,6 @@ struct gendisk {
 	struct kobject *slave_dir;
 
 	struct timer_rand_state *random;
-	int policy;
 
 	atomic_t sync_io;		/* RAID */
 	unsigned long stamp;
@@ -403,6 +402,11 @@ extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
 extern void set_device_ro(struct block_device *bdev, int flag);
 extern void set_disk_ro(struct gendisk *disk, int flag);
 
+static inline int get_disk_ro(struct gendisk *disk)
+{
+	return disk->part0.policy;
+}
+
 /* drivers/char/random.c */
 extern void add_disk_randomness(struct gendisk *disk);
 extern void rand_initialize_disk(struct gendisk *disk);

From 4c46501d1659475dc6c89554af6ce7fe6ecf615c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:11 +0900
Subject: [PATCH 048/132] block: move holder_dir from disk to part0

Move disk->holder_dir to part0->holder_dir.  Kill now mostly
superfluous bdev_get_holder().

While at it, kill superfluous kobject_get/put() around holder_dir,
slave_dir and cmd_filter creation and collapse
disk_sysfs_add_subdirs() into register_disk().  These serve no purpose
but obfuscating the code.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cmd-filter.c    |  9 ++-------
 fs/block_dev.c        | 10 +---------
 fs/partitions/check.c | 15 +++------------
 include/linux/genhd.h |  1 -
 4 files changed, 6 insertions(+), 29 deletions(-)

diff --git a/block/cmd-filter.c b/block/cmd-filter.c
index 79c14996ac11..e669aed4c6bc 100644
--- a/block/cmd-filter.c
+++ b/block/cmd-filter.c
@@ -211,14 +211,10 @@ int blk_register_filter(struct gendisk *disk)
 {
 	int ret;
 	struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
-	struct kobject *parent = kobject_get(disk->holder_dir->parent);
 
-	if (!parent)
-		return -ENODEV;
-
-	ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, parent,
+	ret = kobject_init_and_add(&filter->kobj, &rcf_ktype,
+				   &disk_to_dev(disk)->kobj,
 				   "%s", "cmd_filter");
-
 	if (ret < 0)
 		return ret;
 
@@ -231,7 +227,6 @@ void blk_unregister_filter(struct gendisk *disk)
 	struct blk_cmd_filter *filter = &disk->queue->cmd_filter;
 
 	kobject_put(&filter->kobj);
-	kobject_put(disk->holder_dir->parent);
 }
 EXPORT_SYMBOL(blk_unregister_filter);
 #endif
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c982a9107979..57d572642854 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -548,14 +548,6 @@ static struct kobject *bdev_get_kobj(struct block_device *bdev)
 		return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj);
 }
 
-static struct kobject *bdev_get_holder(struct block_device *bdev)
-{
-	if (bdev->bd_contains != bdev)
-		return kobject_get(bdev->bd_part->holder_dir);
-	else
-		return kobject_get(bdev->bd_disk->holder_dir);
-}
-
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
 	if (!from || !to)
@@ -608,7 +600,7 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
 	if (!bo->sdev)
 		goto fail_put_hdev;
 
-	bo->hdir = bdev_get_holder(bdev);
+	bo->hdir = kobject_get(bdev->bd_part->holder_dir);
 	if (!bo->hdir)
 		goto fail_put_sdev;
 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ace6d03602c7..f0f604950ff4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -305,16 +305,6 @@ struct device_type part_type = {
 	.release	= part_release,
 };
 
-static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
-{
-	struct kobject *k;
-
-	k = kobject_get(&disk_to_dev(disk)->kobj);
-	disk->holder_dir = kobject_create_and_add("holders", k);
-	disk->slave_dir = kobject_create_and_add("slaves", k);
-	kobject_put(k);
-}
-
 static void delete_partition_rcu_cb(struct rcu_head *head)
 {
 	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
@@ -464,7 +454,8 @@ void register_disk(struct gendisk *disk)
 		return;
 	}
 #endif
-	disk_sysfs_add_subdirs(disk);
+	disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
+	disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
 
 	/* No minors to use for partitions */
 	if (!disk_partitionable(disk))
@@ -592,7 +583,7 @@ void del_gendisk(struct gendisk *disk)
 	disk_stat_set_all(disk, 0);
 	disk->stamp = 0;
 
-	kobject_put(disk->holder_dir);
+	kobject_put(disk->part0.holder_dir);
 	kobject_put(disk->slave_dir);
 	disk->driverfs_dev = NULL;
 #ifndef CONFIG_SYSFS_DEPRECATED
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 4411bdd671dd..2c0e1b597ab4 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -141,7 +141,6 @@ struct gendisk {
 
 	int flags;
 	struct device *driverfs_dev;  // FIXME: remove
-	struct kobject *holder_dir;
 	struct kobject *slave_dir;
 
 	struct timer_rand_state *random;

From 0762b8bde9729f10f8e6249809660ff2ec3ad735 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:12 +0900
Subject: [PATCH 049/132] block: always set bdev->bd_part

Till now, bdev->bd_part is set only if the bdev was for parts other
than part0.  This patch makes bdev->bd_part always set so that code
paths don't have to differentiate common handling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c      |  2 +-
 drivers/md/md.c       |  5 +---
 fs/block_dev.c        | 67 ++++++++++++++++++++-----------------------
 fs/partitions/check.c |  7 +----
 include/linux/genhd.h |  2 +-
 5 files changed, 35 insertions(+), 48 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index e0a5ee36849c..a4a7c08d2f20 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1274,7 +1274,7 @@ __setup("fail_make_request=", setup_fail_make_request);
 static int should_fail_request(struct bio *bio)
 {
 	if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
-	    (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
+	    bio->bi_bdev->bd_part->make_it_fail)
 		return should_fail(&fail_make_request, bio->bi_size);
 
 	return 0;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 96e9fccd2eab..2bd9cf416123 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1464,10 +1464,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
 		goto fail;
 
-	if (rdev->bdev->bd_part)
-		ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
-	else
-		ko = &disk_to_dev(rdev->bdev->bd_disk)->kobj;
+	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
 	if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
 		kobject_del(&rdev->kobj);
 		goto fail;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 57d572642854..c3fa19bd64df 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -540,14 +540,6 @@ EXPORT_SYMBOL(bd_release);
  *           /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
  */
 
-static struct kobject *bdev_get_kobj(struct block_device *bdev)
-{
-	if (bdev->bd_contains != bdev)
-		return kobject_get(&part_to_dev(bdev->bd_part)->kobj);
-	else
-		return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj);
-}
-
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
 	if (!from || !to)
@@ -596,7 +588,7 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
 	if (!bo->hdev)
 		goto fail_put_sdir;
 
-	bo->sdev = bdev_get_kobj(bdev);
+	bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
 	if (!bo->sdev)
 		goto fail_put_hdev;
 
@@ -919,7 +911,6 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
 
 static int do_open(struct block_device *bdev, struct file *file, int for_part)
 {
-	struct module *owner = NULL;
 	struct gendisk *disk;
 	struct hd_struct *part = NULL;
 	int ret;
@@ -941,25 +932,27 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 
 	ret = -ENXIO;
 	file->f_mapping = bdev->bd_inode->i_mapping;
+
 	lock_kernel();
+
 	disk = get_gendisk(bdev->bd_dev, &partno);
-	if (!disk) {
-		unlock_kernel();
-		bdput(bdev);
-		return ret;
-	}
-	owner = disk->fops->owner;
+	if (!disk)
+		goto out_unlock_kernel;
+	part = disk_get_part(disk, partno);
+	if (!part)
+		goto out_unlock_kernel;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
+		bdev->bd_part = part;
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev->bd_inode, file);
 				if (ret)
-					goto out_first;
+					goto out_clear;
 			}
 			if (!bdev->bd_openers) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -975,31 +968,32 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 			whole = bdget_disk(disk, 0);
 			ret = -ENOMEM;
 			if (!whole)
-				goto out_first;
+				goto out_clear;
 			BUG_ON(for_part);
 			ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
 			if (ret)
-				goto out_first;
+				goto out_clear;
 			bdev->bd_contains = whole;
-			part = disk_get_part(disk, partno);
 			bdev->bd_inode->i_data.backing_dev_info =
 			   whole->bd_inode->i_data.backing_dev_info;
 			if (!(disk->flags & GENHD_FL_UP) ||
 			    !part || !part->nr_sects) {
 				ret = -ENXIO;
-				goto out_first;
+				goto out_clear;
 			}
-			bdev->bd_part = part;
 			bd_set_size(bdev, (loff_t)part->nr_sects << 9);
 		}
 	} else {
+		disk_put_part(part);
 		put_disk(disk);
-		module_put(owner);
+		module_put(disk->fops->owner);
+		part = NULL;
+		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
 				ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
 				if (ret)
-					goto out;
+					goto out_unlock_bdev;
 			}
 			if (bdev->bd_invalidated)
 				rescan_partitions(bdev->bd_disk, bdev);
@@ -1012,20 +1006,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
 	unlock_kernel();
 	return 0;
 
-out_first:
+ out_clear:
 	bdev->bd_disk = NULL;
+	bdev->bd_part = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, 1);
 	bdev->bd_contains = NULL;
-	put_disk(disk);
-	disk_put_part(part);
-	module_put(owner);
-out:
+ out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
+ out_unlock_kernel:
 	unlock_kernel();
-	if (ret)
-		bdput(bdev);
+
+	disk_put_part(part);
+	if (disk)
+		module_put(disk->fops->owner);
+	put_disk(disk);
+	bdput(bdev);
+
 	return ret;
 }
 
@@ -1110,11 +1108,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
 
 		put_disk(disk);
 		module_put(owner);
-
-		if (bdev->bd_contains != bdev) {
-			disk_put_part(bdev->bd_part);
-			bdev->bd_part = NULL;
-		}
+		disk_put_part(bdev->bd_part);
+		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 		if (bdev != bdev->bd_contains)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f0f604950ff4..87298c0fc8ce 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -134,12 +134,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf)
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
-	int partno = 0;
-
-	if (bdev->bd_part)
-		partno = bdev->bd_part->partno;
-
-	return disk_name(bdev->bd_disk, partno, buf);
+	return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
 }
 
 EXPORT_SYMBOL(bdevname);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 2c0e1b597ab4..45a3682b5d87 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -412,7 +412,7 @@ extern void rand_initialize_disk(struct gendisk *disk);
 
 static inline sector_t get_start_sect(struct block_device *bdev)
 {
-	return bdev->bd_contains == bdev ? 0 : bdev->bd_part->start_sect;
+	return bdev->bd_part->start_sect;
 }
 static inline sector_t get_capacity(struct gendisk *disk)
 {

From eddb2e26b5ee3c5da68ba4bf1921ba20e2097bff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:13 +0900
Subject: [PATCH 050/132] block: kill GENHD_FL_FAIL and use part0->make_it_fail

GENHD_FL_FAIL for disk is what make_it_fail is for parts.  Kill it and
use part0->make_it_fail.  Sysfs node handling is unified too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c      |  5 +++--
 block/genhd.c         | 30 +-----------------------------
 fs/partitions/check.c | 10 +++++-----
 include/linux/genhd.h |  8 +++++++-
 4 files changed, 16 insertions(+), 37 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a4a7c08d2f20..505ec61067df 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1273,8 +1273,9 @@ __setup("fail_make_request=", setup_fail_make_request);
 
 static int should_fail_request(struct bio *bio)
 {
-	if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
-	    bio->bi_bdev->bd_part->make_it_fail)
+	struct hd_struct *part = bio->bi_bdev->bd_part;
+
+	if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
 		return should_fail(&fail_make_request, bio->bi_size);
 
 	return 0;
diff --git a/block/genhd.c b/block/genhd.c
index 70358f3c7423..06a252f2b967 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -795,34 +795,6 @@ static ssize_t disk_stat_show(struct device *dev,
 		jiffies_to_msecs(disk_stat_read(disk, time_in_queue)));
 }
 
-#ifdef CONFIG_FAIL_MAKE_REQUEST
-static ssize_t disk_fail_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
-{
-	struct gendisk *disk = dev_to_disk(dev);
-
-	return sprintf(buf, "%d\n", disk->flags & GENHD_FL_FAIL ? 1 : 0);
-}
-
-static ssize_t disk_fail_store(struct device *dev,
-			       struct device_attribute *attr,
-			       const char *buf, size_t count)
-{
-	struct gendisk *disk = dev_to_disk(dev);
-	int i;
-
-	if (count > 0 && sscanf(buf, "%d", &i) > 0) {
-		if (i == 0)
-			disk->flags &= ~GENHD_FL_FAIL;
-		else
-			disk->flags |= GENHD_FL_FAIL;
-	}
-
-	return count;
-}
-
-#endif
-
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
@@ -832,7 +804,7 @@ static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
-	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, disk_fail_show, disk_fail_store);
+	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
 #endif
 
 static struct attribute *disk_attrs[] = {
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 87298c0fc8ce..60592d9f43b6 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev,
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-static ssize_t part_fail_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_fail_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
 
 	return sprintf(buf, "%d\n", p->make_it_fail);
 }
 
-static ssize_t part_fail_store(struct device *dev,
-			       struct device_attribute *attr,
-			       const char *buf, size_t count)
+ssize_t part_fail_store(struct device *dev,
+			struct device_attribute *attr,
+			const char *buf, size_t count)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	int i;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 45a3682b5d87..3d15b42dc352 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -112,7 +112,6 @@ struct hd_struct {
 #define GENHD_FL_CD				8
 #define GENHD_FL_UP				16
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	32
-#define GENHD_FL_FAIL				64
 
 struct gendisk {
 	/* major, first_minor, minors and ext_minors are input
@@ -596,6 +595,13 @@ extern void blk_unregister_region(dev_t devt, unsigned long range);
 
 extern ssize_t part_size_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+extern ssize_t part_fail_show(struct device *dev,
+			      struct device_attribute *attr, char *buf);
+extern ssize_t part_fail_store(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count);
+#endif /* CONFIG_FAIL_MAKE_REQUEST */
 
 #else /* CONFIG_BLOCK */
 

From 074a7aca7afa6f230104e8e65eba3420263714a5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:14 +0900
Subject: [PATCH 051/132] block: move stats from disk to part0

Move stats related fields - stamp, in_flight, dkstats - from disk to
part0 and unify stat handling such that...

* part_stat_*() now updates part0 together if the specified partition
  is not part0.  ie. part_stat_*() are now essentially all_stat_*().

* {disk|all}_stat_*() are gone.

* part_round_stats() is updated similarly.  It handles part0 stats
  automatically and disk_round_stats() is killed.

* part_{inc|dec}_in_flight() is implemented which automatically updates
  part0 stats for parts other than part0.

* disk_map_sector_rcu() is updated to return part0 if no part matches.
  Combined with the above changes, this makes NULL special case
  handling in callers unnecessary.

* Separate stats show code paths for disk are collapsed into part
  stats show code paths.

* Rename disk_stat_lock/unlock() to part_stat_lock/unlock()

While at it, reposition stat handling macros a bit and add missing
parentheses around macro parameters.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c           |  84 ++++++++-----------
 block/blk-merge.c          |  12 +--
 block/genhd.c              |  97 ++++++----------------
 drivers/block/aoe/aoecmd.c |  12 +--
 drivers/md/dm.c            |  27 +++---
 drivers/md/linear.c        |   9 +-
 drivers/md/md.c            |   4 +-
 drivers/md/multipath.c     |   9 +-
 drivers/md/raid0.c         |   9 +-
 drivers/md/raid1.c         |   9 +-
 drivers/md/raid10.c        |   9 +-
 drivers/md/raid5.c         |   9 +-
 fs/partitions/check.c      |  12 +--
 include/linux/genhd.h      | 165 +++++++++++--------------------------
 14 files changed, 168 insertions(+), 299 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 505ec61067df..98138f002524 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -61,21 +61,17 @@ static void drive_stat_acct(struct request *rq, int new_io)
 	if (!blk_fs_request(rq) || !rq->rq_disk)
 		return;
 
-	cpu = disk_stat_lock();
+	cpu = part_stat_lock();
 	part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
 
 	if (!new_io)
-		all_stat_inc(cpu, rq->rq_disk, part, merges[rw], rq->sector);
+		part_stat_inc(cpu, part, merges[rw]);
 	else {
-		disk_round_stats(cpu, rq->rq_disk);
-		rq->rq_disk->in_flight++;
-		if (part) {
-			part_round_stats(cpu, part);
-			part->in_flight++;
-		}
+		part_round_stats(cpu, part);
+		part_inc_in_flight(part);
 	}
 
-	disk_stat_unlock();
+	part_stat_unlock();
 }
 
 void blk_queue_congestion_threshold(struct request_queue *q)
@@ -983,8 +979,22 @@ static inline void add_request(struct request_queue *q, struct request *req)
 	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
 }
 
-/*
- * disk_round_stats()	- Round off the performance stats on a struct
+static void part_round_stats_single(int cpu, struct hd_struct *part,
+				    unsigned long now)
+{
+	if (now == part->stamp)
+		return;
+
+	if (part->in_flight) {
+		__part_stat_add(cpu, part, time_in_queue,
+				part->in_flight * (now - part->stamp));
+		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
+	}
+	part->stamp = now;
+}
+
+/**
+ * part_round_stats()	- Round off the performance stats on a struct
  * disk_stats.
  *
  * The average IO queue length and utilisation statistics are maintained
@@ -998,36 +1008,15 @@ static inline void add_request(struct request_queue *q, struct request *req)
  * /proc/diskstats.  This accounts immediately for all queue usage up to
  * the current jiffies and restarts the counters again.
  */
-void disk_round_stats(int cpu, struct gendisk *disk)
-{
-	unsigned long now = jiffies;
-
-	if (now == disk->stamp)
-		return;
-
-	if (disk->in_flight) {
-		disk_stat_add(cpu, disk, time_in_queue,
-			      disk->in_flight * (now - disk->stamp));
-		disk_stat_add(cpu, disk, io_ticks, (now - disk->stamp));
-	}
-	disk->stamp = now;
-}
-EXPORT_SYMBOL_GPL(disk_round_stats);
-
 void part_round_stats(int cpu, struct hd_struct *part)
 {
 	unsigned long now = jiffies;
 
-	if (now == part->stamp)
-		return;
-
-	if (part->in_flight) {
-		part_stat_add(cpu, part, time_in_queue,
-			      part->in_flight * (now - part->stamp));
-		part_stat_add(cpu, part, io_ticks, (now - part->stamp));
-	}
-	part->stamp = now;
+	if (part->partno)
+		part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
+	part_round_stats_single(cpu, part, now);
 }
+EXPORT_SYMBOL_GPL(part_round_stats);
 
 /*
  * queue lock must be held
@@ -1567,11 +1556,10 @@ static int __end_that_request_first(struct request *req, int error,
 		struct hd_struct *part;
 		int cpu;
 
-		cpu = disk_stat_lock();
+		cpu = part_stat_lock();
 		part = disk_map_sector_rcu(req->rq_disk, req->sector);
-		all_stat_add(cpu, req->rq_disk, part, sectors[rw],
-			     nr_bytes >> 9, req->sector);
-		disk_stat_unlock();
+		part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9);
+		part_stat_unlock();
 	}
 
 	total_bytes = bio_nbytes = 0;
@@ -1758,19 +1746,15 @@ static void end_that_request_last(struct request *req, int error)
 		struct hd_struct *part;
 		int cpu;
 
-		cpu = disk_stat_lock();
+		cpu = part_stat_lock();
 		part = disk_map_sector_rcu(disk, req->sector);
 
-		all_stat_inc(cpu, disk, part, ios[rw], req->sector);
-		all_stat_add(cpu, disk, part, ticks[rw], duration, req->sector);
-		disk_round_stats(cpu, disk);
-		disk->in_flight--;
-		if (part) {
-			part_round_stats(cpu, part);
-			part->in_flight--;
-		}
+		part_stat_inc(cpu, part, ios[rw]);
+		part_stat_add(cpu, part, ticks[rw], duration);
+		part_round_stats(cpu, part);
+		part_dec_in_flight(part);
 
-		disk_stat_unlock();
+		part_stat_unlock();
 	}
 
 	if (req->end_io)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d926a24bf1fd..c77196d55899 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -390,17 +390,13 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 		struct hd_struct *part;
 		int cpu;
 
-		cpu = disk_stat_lock();
+		cpu = part_stat_lock();
 		part = disk_map_sector_rcu(req->rq_disk, req->sector);
 
-		disk_round_stats(cpu, req->rq_disk);
-		req->rq_disk->in_flight--;
-		if (part) {
-			part_round_stats(cpu, part);
-			part->in_flight--;
-		}
+		part_round_stats(cpu, part);
+		part_dec_in_flight(part);
 
-		disk_stat_unlock();
+		part_stat_unlock();
 	}
 
 	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
diff --git a/block/genhd.c b/block/genhd.c
index 06a252f2b967..e1cb96fb883e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -176,7 +176,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
  * while preemption is disabled.
  *
  * RETURNS:
- * Found partition on success, NULL if there's no matching partition.
+ * Found partition on success, part0 is returned if no partition matches
  */
 struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 {
@@ -189,7 +189,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 		    sector < part->start_sect + part->nr_sects)
 			return part;
 	}
-	return NULL;
+	return &disk->part0;
 }
 EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
 
@@ -580,24 +580,24 @@ void __init printk_all_partitions(void)
 		 * numbers in hex - the same format as the root=
 		 * option takes.
 		 */
-		printk("%s %10llu %s",
-		       bdevt_str(disk_devt(disk), devt_buf),
-		       (unsigned long long)get_capacity(disk) >> 1,
-		       disk_name(disk, 0, name_buf));
-		if (disk->driverfs_dev != NULL &&
-		    disk->driverfs_dev->driver != NULL)
-			printk(" driver: %s\n",
-			       disk->driverfs_dev->driver->name);
-		else
-			printk(" (driver?)\n");
+		disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
+		while ((part = disk_part_iter_next(&piter))) {
+			bool is_part0 = part == &disk->part0;
 
-		/* now show the partitions */
-		disk_part_iter_init(&piter, disk, 0);
-		while ((part = disk_part_iter_next(&piter)))
-			printk("  %s %10llu %s\n",
+			printk("%s%s %10llu %s", is_part0 ? "" : "  ",
 			       bdevt_str(part_devt(part), devt_buf),
 			       (unsigned long long)part->nr_sects >> 1,
 			       disk_name(disk, part->partno, name_buf));
+			if (is_part0) {
+				if (disk->driverfs_dev != NULL &&
+				    disk->driverfs_dev->driver != NULL)
+					printk(" driver: %s\n",
+					      disk->driverfs_dev->driver->name);
+				else
+					printk(" (driver?)\n");
+			} else
+				printk("\n");
+		}
 		disk_part_iter_exit(&piter);
 	}
 	class_dev_iter_exit(&iter);
@@ -674,12 +674,7 @@ static int show_partition(struct seq_file *seqf, void *v)
 		return 0;
 
 	/* show the full disk and all non-0 size partitions of it */
-	seq_printf(seqf, "%4d  %7d %10llu %s\n",
-		MAJOR(disk_devt(sgp)), MINOR(disk_devt(sgp)),
-		(unsigned long long)get_capacity(sgp) >> 1,
-		disk_name(sgp, 0, buf));
-
-	disk_part_iter_init(&piter, sgp, 0);
+	disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
 	while ((part = disk_part_iter_next(&piter)))
 		seq_printf(seqf, "%4d  %7d %10llu %s\n",
 			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
@@ -768,40 +763,13 @@ static ssize_t disk_capability_show(struct device *dev,
 	return sprintf(buf, "%x\n", disk->flags);
 }
 
-static ssize_t disk_stat_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
-{
-	struct gendisk *disk = dev_to_disk(dev);
-	int cpu;
-
-	cpu = disk_stat_lock();
-	disk_round_stats(cpu, disk);
-	disk_stat_unlock();
-	return sprintf(buf,
-		"%8lu %8lu %8llu %8u "
-		"%8lu %8lu %8llu %8u "
-		"%8u %8u %8u"
-		"\n",
-		disk_stat_read(disk, ios[READ]),
-		disk_stat_read(disk, merges[READ]),
-		(unsigned long long)disk_stat_read(disk, sectors[READ]),
-		jiffies_to_msecs(disk_stat_read(disk, ticks[READ])),
-		disk_stat_read(disk, ios[WRITE]),
-		disk_stat_read(disk, merges[WRITE]),
-		(unsigned long long)disk_stat_read(disk, sectors[WRITE]),
-		jiffies_to_msecs(disk_stat_read(disk, ticks[WRITE])),
-		disk->in_flight,
-		jiffies_to_msecs(disk_stat_read(disk, io_ticks)),
-		jiffies_to_msecs(disk_stat_read(disk, time_in_queue)));
-}
-
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
-static DEVICE_ATTR(stat, S_IRUGO, disk_stat_show, NULL);
+static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -836,7 +804,7 @@ static void disk_release(struct device *dev)
 
 	kfree(disk->random);
 	kfree(disk->__part);
-	free_disk_stats(disk);
+	free_part_stats(&disk->part0);
 	kfree(disk);
 }
 struct class block_class = {
@@ -873,28 +841,11 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 				"\n\n");
 	*/
  
-	cpu = disk_stat_lock();
-	disk_round_stats(cpu, gp);
-	disk_stat_unlock();
-	seq_printf(seqf, "%4d %7d %s %lu %lu %llu %u %lu %lu %llu %u %u %u %u\n",
-		MAJOR(disk_devt(gp)), MINOR(disk_devt(gp)),
-		disk_name(gp, 0, buf),
-		disk_stat_read(gp, ios[0]), disk_stat_read(gp, merges[0]),
-		(unsigned long long)disk_stat_read(gp, sectors[0]),
-		jiffies_to_msecs(disk_stat_read(gp, ticks[0])),
-		disk_stat_read(gp, ios[1]), disk_stat_read(gp, merges[1]),
-		(unsigned long long)disk_stat_read(gp, sectors[1]),
-		jiffies_to_msecs(disk_stat_read(gp, ticks[1])),
-		gp->in_flight,
-		jiffies_to_msecs(disk_stat_read(gp, io_ticks)),
-		jiffies_to_msecs(disk_stat_read(gp, time_in_queue)));
-
-	/* now show all non-0 size partitions of it */
-	disk_part_iter_init(&piter, gp, 0);
+	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_PART0);
 	while ((hd = disk_part_iter_next(&piter))) {
-		cpu = disk_stat_lock();
+		cpu = part_stat_lock();
 		part_round_stats(cpu, hd);
-		disk_stat_unlock();
+		part_stat_unlock();
 		seq_printf(seqf, "%4d %7d %s %lu %lu %llu "
 			   "%u %lu %lu %llu %u %u %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
@@ -1000,7 +951,7 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 		int tot_minors = minors + ext_minors;
 		int size = tot_minors * sizeof(struct hd_struct *);
 
-		if (!init_disk_stats(disk)) {
+		if (!init_part_stats(&disk->part0)) {
 			kfree(disk);
 			return NULL;
 		}
@@ -1008,7 +959,7 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 		disk->__part = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO,
 					    node_id);
 		if (!disk->__part) {
-			free_disk_stats(disk);
+				free_part_stats(&disk->part0);
 			kfree(disk);
 			return NULL;
 		}
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 934800f979c9..961d29a53cab 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -758,15 +758,15 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
 	struct hd_struct *part;
 	int cpu;
 
-	cpu = disk_stat_lock();
+	cpu = part_stat_lock();
 	part = disk_map_sector_rcu(disk, sector);
 
-	all_stat_inc(cpu, disk, part, ios[rw], sector);
-	all_stat_add(cpu, disk, part, ticks[rw], duration, sector);
-	all_stat_add(cpu, disk, part, sectors[rw], n_sect, sector);
-	all_stat_add(cpu, disk, part, io_ticks, duration, sector);
+	part_stat_inc(cpu, part, ios[rw]);
+	part_stat_add(cpu, part, ticks[rw], duration);
+	part_stat_add(cpu, part, sectors[rw], n_sect);
+	part_stat_add(cpu, part, io_ticks, duration);
 
-	disk_stat_unlock();
+	part_stat_unlock();
 }
 
 void
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 637806695bb9..327de03a5bdf 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -381,10 +381,10 @@ static void start_io_acct(struct dm_io *io)
 
 	io->start_time = jiffies;
 
-	cpu = disk_stat_lock();
-	disk_round_stats(cpu, dm_disk(md));
-	disk_stat_unlock();
-	dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
+	cpu = part_stat_lock();
+	part_round_stats(cpu, &dm_disk(md)->part0);
+	part_stat_unlock();
+	dm_disk(md)->part0.in_flight = atomic_inc_return(&md->pending);
 }
 
 static int end_io_acct(struct dm_io *io)
@@ -395,12 +395,13 @@ static int end_io_acct(struct dm_io *io)
 	int pending, cpu;
 	int rw = bio_data_dir(bio);
 
-	cpu = disk_stat_lock();
-	disk_round_stats(cpu, dm_disk(md));
-	disk_stat_add(cpu, dm_disk(md), ticks[rw], duration);
-	disk_stat_unlock();
+	cpu = part_stat_lock();
+	part_round_stats(cpu, &dm_disk(md)->part0);
+	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
+	part_stat_unlock();
 
-	dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);
+	dm_disk(md)->part0.in_flight = pending =
+		atomic_dec_return(&md->pending);
 
 	return !pending;
 }
@@ -899,10 +900,10 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 
 	down_read(&md->io_lock);
 
-	cpu = disk_stat_lock();
-	disk_stat_inc(cpu, dm_disk(md), ios[rw]);
-	disk_stat_add(cpu, dm_disk(md), sectors[rw], bio_sectors(bio));
-	disk_stat_unlock();
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
+	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
+	part_stat_unlock();
 
 	/*
 	 * If we're suspended we have to queue
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 00cbc8e47294..c80ea90593d3 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -325,10 +325,11 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 
-	cpu = disk_stat_lock();
-	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
-	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
-	disk_stat_unlock();
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
 
 	tmp_dev = which_dev(mddev, bio->bi_sector);
 	block = bio->bi_sector >> 1;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2bd9cf416123..0a3a4bdcd4af 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5546,8 +5546,8 @@ static int is_mddev_idle(mddev_t *mddev)
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
-		curr_events = disk_stat_read(disk, sectors[0]) + 
-				disk_stat_read(disk, sectors[1]) - 
+		curr_events = part_stat_read(&disk->part0, sectors[0]) +
+				part_stat_read(&disk->part0, sectors[1]) -
 				atomic_read(&disk->sync_io);
 		/* sync IO will cause sync_io to increase before the disk_stats
 		 * as sync_io is counted when a request starts, and
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 182f5a94cdc5..8bb8794129b3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -159,10 +159,11 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
 
-	cpu = disk_stat_lock();
-	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
-	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
-	disk_stat_unlock();
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
 
 	mp_bh->path = multipath_map(conf);
 	if (mp_bh->path < 0) {
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e26030fa59ab..f52f442a735f 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -406,10 +406,11 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 
-	cpu = disk_stat_lock();
-	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
-	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
-	disk_stat_unlock();
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
 
 	chunk_size = mddev->chunk_size >> 10;
 	chunk_sects = mddev->chunk_size >> 9;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index babb13036f93..b9764429d856 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -804,10 +804,11 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 	bitmap = mddev->bitmap;
 
-	cpu = disk_stat_lock();
-	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
-	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
-	disk_stat_unlock();
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
 
 	/*
 	 * make_request() can abort the operation when READA is being
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5ec80da0a9d7..5f990133f5ef 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -844,10 +844,11 @@ static int make_request(struct request_queue *q, struct bio * bio)
 	 */
 	wait_barrier(conf);
 
-	cpu = disk_stat_lock();
-	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
-	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bio));
-	disk_stat_unlock();
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bio));
+	part_stat_unlock();
 
 	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5899f211515f..ae16794bef20 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3396,10 +3396,11 @@ static int make_request(struct request_queue *q, struct bio * bi)
 
 	md_write_start(mddev, bi);
 
-	cpu = disk_stat_lock();
-	disk_stat_inc(cpu, mddev->gendisk, ios[rw]);
-	disk_stat_add(cpu, mddev->gendisk, sectors[rw], bio_sectors(bi));
-	disk_stat_unlock();
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
+		      bio_sectors(bi));
+	part_stat_unlock();
 
 	if (rw == READ &&
 	     mddev->reshape_position == MaxSector &&
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 60592d9f43b6..f517869e8d10 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -210,15 +210,15 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
 
-static ssize_t part_stat_show(struct device *dev,
-			      struct device_attribute *attr, char *buf)
+ssize_t part_stat_show(struct device *dev,
+		       struct device_attribute *attr, char *buf)
 {
 	struct hd_struct *p = dev_to_part(dev);
 	int cpu;
 
-	cpu = disk_stat_lock();
+	cpu = part_stat_lock();
 	part_round_stats(cpu, p);
-	disk_stat_unlock();
+	part_stat_unlock();
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
@@ -575,8 +575,8 @@ void del_gendisk(struct gendisk *disk)
 	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 	unlink_gendisk(disk);
-	disk_stat_set_all(disk, 0);
-	disk->stamp = 0;
+	part_stat_set_all(&disk->part0, 0);
+	disk->part0.stamp = 0;
 
 	kobject_put(disk->part0.holder_dir);
 	kobject_put(disk->slave_dir);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 3d15b42dc352..c90e1b4fbe5a 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -145,13 +145,6 @@ struct gendisk {
 	struct timer_rand_state *random;
 
 	atomic_t sync_io;		/* RAID */
-	unsigned long stamp;
-	int in_flight;
-#ifdef	CONFIG_SMP
-	struct disk_stats *dkstats;
-#else
-	struct disk_stats dkstats;
-#endif
 	struct work_struct async_notify;
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity *integrity;
@@ -232,46 +225,18 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
  * internal use only.
  */
 #ifdef	CONFIG_SMP
-#define disk_stat_lock()	({ rcu_read_lock(); get_cpu(); })
-#define disk_stat_unlock()	do { put_cpu(); rcu_read_unlock(); } while (0)
+#define part_stat_lock()	({ rcu_read_lock(); get_cpu(); })
+#define part_stat_unlock()	do { put_cpu(); rcu_read_unlock(); } while (0)
 
-#define disk_stat_add(cpu, gendiskp, field, addnd)			\
-	(per_cpu_ptr(gendiskp->dkstats, cpu)->field += addnd)
-
-#define disk_stat_read(gendiskp, field)					\
-({									\
-	typeof(gendiskp->dkstats->field) res = 0;			\
-	int i;								\
-	for_each_possible_cpu(i)					\
-		res += per_cpu_ptr(gendiskp->dkstats, i)->field;	\
-	res;								\
-})
-
-static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		memset(per_cpu_ptr(gendiskp->dkstats, i), value,
-				sizeof(struct disk_stats));
-}		
-
-#define part_stat_add(cpu, part, field, addnd)				\
-	(per_cpu_ptr(part->dkstats, cpu)->field += addnd)
-
-#define all_stat_add(cpu, gendiskp, part, field, addnd, sector)		\
-({									\
-	if (part)							\
-		part_stat_add(cpu, part, field, addnd);			\
-	disk_stat_add(cpu, gendiskp, field, addnd);			\
-})
+#define __part_stat_add(cpu, part, field, addnd)			\
+	(per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))
 
 #define part_stat_read(part, field)					\
 ({									\
-	typeof(part->dkstats->field) res = 0;				\
+	typeof((part)->dkstats->field) res = 0;				\
 	int i;								\
 	for_each_possible_cpu(i)					\
-		res += per_cpu_ptr(part->dkstats, i)->field;		\
+		res += per_cpu_ptr((part)->dkstats, i)->field;		\
 	res;								\
 })
 
@@ -284,74 +249,6 @@ static inline void part_stat_set_all(struct hd_struct *part, int value)
 				sizeof(struct disk_stats));
 }
 
-#else /* !CONFIG_SMP */
-#define disk_stat_lock()	({ rcu_read_lock(); 0; })
-#define disk_stat_unlock()	rcu_read_unlock()
-
-#define disk_stat_add(cpu, gendiskp, field, addnd)			\
-	(gendiskp->dkstats.field += addnd)
-#define disk_stat_read(gendiskp, field)	(gendiskp->dkstats.field)
-
-static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)
-{
-	memset(&gendiskp->dkstats, value, sizeof (struct disk_stats));
-}
-
-#define part_stat_add(cpu, part, field, addnd)				\
-	(part->dkstats.field += addnd)
-
-#define all_stat_add(cpu, gendiskp, part, field, addnd, sector)		\
-({									\
-	if (part)							\
-		part_stat_add(cpu, part, field, addnd);			\
-	disk_stat_add(cpu, gendiskp, field, addnd);			\
-})
-
-#define part_stat_read(part, field)	(part->dkstats.field)
-
-static inline void part_stat_set_all(struct hd_struct *part, int value)
-{
-	memset(&part->dkstats, value, sizeof(struct disk_stats));
-}
-
-#endif /* CONFIG_SMP */
-
-#define disk_stat_dec(cpu, gendiskp, field)				\
-	disk_stat_add(cpu, gendiskp, field, -1)
-#define disk_stat_inc(cpu, gendiskp, field)				\
-	disk_stat_add(cpu, gendiskp, field, 1)
-#define disk_stat_sub(cpu, gendiskp, field, subnd)			\
-	disk_stat_add(cpu, gendiskp, field, -subnd)
-
-#define part_stat_dec(cpu, gendiskp, field)				\
-	part_stat_add(cpu, gendiskp, field, -1)
-#define part_stat_inc(cpu, gendiskp, field)				\
-	part_stat_add(cpu, gendiskp, field, 1)
-#define part_stat_sub(cpu, gendiskp, field, subnd)			\
-	part_stat_add(cpu, gendiskp, field, -subnd)
-
-#define all_stat_dec(cpu, gendiskp, field, sector)			\
-	all_stat_add(cpu, gendiskp, field, -1, sector)
-#define all_stat_inc(cpu, gendiskp, part, field, sector)		\
-	all_stat_add(cpu, gendiskp, part, field, 1, sector)
-#define all_stat_sub(cpu, gendiskp, part, field, subnd, sector)		\
-	all_stat_add(cpu, gendiskp, part, field, -subnd, sector)
-
-/* Inlines to alloc and free disk stats in struct gendisk */
-#ifdef  CONFIG_SMP
-static inline int init_disk_stats(struct gendisk *disk)
-{
-	disk->dkstats = alloc_percpu(struct disk_stats);
-	if (!disk->dkstats)
-		return 0;
-	return 1;
-}
-
-static inline void free_disk_stats(struct gendisk *disk)
-{
-	free_percpu(disk->dkstats);
-}
-
 static inline int init_part_stats(struct hd_struct *part)
 {
 	part->dkstats = alloc_percpu(struct disk_stats);
@@ -365,14 +262,18 @@ static inline void free_part_stats(struct hd_struct *part)
 	free_percpu(part->dkstats);
 }
 
-#else	/* CONFIG_SMP */
-static inline int init_disk_stats(struct gendisk *disk)
-{
-	return 1;
-}
+#else /* !CONFIG_SMP */
+#define part_stat_lock()	({ rcu_read_lock(); 0; })
+#define part_stat_unlock()	rcu_read_unlock()
 
-static inline void free_disk_stats(struct gendisk *disk)
+#define __part_stat_add(cpu, part, field, addnd)				\
+	((part)->dkstats.field += addnd)
+
+#define part_stat_read(part, field)	((part)->dkstats.field)
+
+static inline void part_stat_set_all(struct hd_struct *part, int value)
 {
+	memset(&part->dkstats, value, sizeof(struct disk_stats));
 }
 
 static inline int init_part_stats(struct hd_struct *part)
@@ -383,10 +284,38 @@ static inline int init_part_stats(struct hd_struct *part)
 static inline void free_part_stats(struct hd_struct *part)
 {
 }
-#endif	/* CONFIG_SMP */
+
+#endif /* CONFIG_SMP */
+
+#define part_stat_add(cpu, part, field, addnd)	do {			\
+	__part_stat_add((cpu), (part), field, addnd);			\
+	if ((part)->partno)						\
+		__part_stat_add((cpu), &part_to_disk((part))->part0,	\
+				field, addnd);				\
+} while (0)
+
+#define part_stat_dec(cpu, gendiskp, field)				\
+	part_stat_add(cpu, gendiskp, field, -1)
+#define part_stat_inc(cpu, gendiskp, field)				\
+	part_stat_add(cpu, gendiskp, field, 1)
+#define part_stat_sub(cpu, gendiskp, field, subnd)			\
+	part_stat_add(cpu, gendiskp, field, -subnd)
+
+static inline void part_inc_in_flight(struct hd_struct *part)
+{
+	part->in_flight++;
+	if (part->partno)
+		part_to_disk(part)->part0.in_flight++;
+}
+
+static inline void part_dec_in_flight(struct hd_struct *part)
+{
+	part->in_flight--;
+	if (part->partno)
+		part_to_disk(part)->part0.in_flight--;
+}
 
 /* drivers/block/ll_rw_blk.c */
-extern void disk_round_stats(int cpu, struct gendisk *disk);
 extern void part_round_stats(int cpu, struct hd_struct *part);
 
 /* drivers/block/genhd.c */
@@ -595,6 +524,8 @@ extern void blk_unregister_region(dev_t devt, unsigned long range);
 
 extern ssize_t part_size_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);
+extern ssize_t part_stat_show(struct device *dev,
+			      struct device_attribute *attr, char *buf);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 extern ssize_t part_fail_show(struct device *dev,
 			      struct device_attribute *attr, char *buf);

From 540eed5637b766bb1e881ef744c42617760b4815 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:15 +0900
Subject: [PATCH 052/132] block: make partition array dynamic

disk->__part used to be statically allocated to the maximum possible
number of partitions.  This patch makes the partition array allocation
dynamic.  The added overhead is minimal, as the only real change is
that one memory dereference becomes an RCU dereference.  This saves a
bit of memory, avoids spending cpu cycles iterating through unoccupied
slots, and makes increasing the partition limit easier.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 129 +++++++++++++++++++++++++++++++++++-------
 block/ioctl.c         |   2 +-
 fs/partitions/check.c |  31 ++++++++--
 include/linux/genhd.h |  19 ++++++-
 4 files changed, 154 insertions(+), 27 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index e1cb96fb883e..c2b14aa69d58 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -52,14 +52,21 @@ static struct device_type disk_type;
  */
 struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
 {
-	struct hd_struct *part;
+	struct hd_struct *part = NULL;
+	struct disk_part_tbl *ptbl;
 
-	if (unlikely(partno < 0 || partno >= disk_max_parts(disk)))
+	if (unlikely(partno < 0))
 		return NULL;
+
 	rcu_read_lock();
-	part = rcu_dereference(disk->__part[partno]);
-	if (part)
-		get_device(part_to_dev(part));
+
+	ptbl = rcu_dereference(disk->part_tbl);
+	if (likely(partno < ptbl->len)) {
+		part = rcu_dereference(ptbl->part[partno]);
+		if (part)
+			get_device(part_to_dev(part));
+	}
+
 	rcu_read_unlock();
 
 	return part;
@@ -80,17 +87,24 @@ EXPORT_SYMBOL_GPL(disk_get_part);
 void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
 			  unsigned int flags)
 {
+	struct disk_part_tbl *ptbl;
+
+	rcu_read_lock();
+	ptbl = rcu_dereference(disk->part_tbl);
+
 	piter->disk = disk;
 	piter->part = NULL;
 
 	if (flags & DISK_PITER_REVERSE)
-		piter->idx = disk_max_parts(piter->disk) - 1;
+		piter->idx = ptbl->len - 1;
 	else if (flags & DISK_PITER_INCL_PART0)
 		piter->idx = 0;
 	else
 		piter->idx = 1;
 
 	piter->flags = flags;
+
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(disk_part_iter_init);
 
@@ -105,13 +119,16 @@ EXPORT_SYMBOL_GPL(disk_part_iter_init);
  */
 struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
 {
+	struct disk_part_tbl *ptbl;
 	int inc, end;
 
 	/* put the last partition */
 	disk_put_part(piter->part);
 	piter->part = NULL;
 
+	/* get part_tbl */
 	rcu_read_lock();
+	ptbl = rcu_dereference(piter->disk->part_tbl);
 
 	/* determine iteration parameters */
 	if (piter->flags & DISK_PITER_REVERSE) {
@@ -122,14 +139,14 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
 			end = 0;
 	} else {
 		inc = 1;
-		end = disk_max_parts(piter->disk);
+		end = ptbl->len;
 	}
 
 	/* iterate to the next partition */
 	for (; piter->idx != end; piter->idx += inc) {
 		struct hd_struct *part;
 
-		part = rcu_dereference(piter->disk->__part[piter->idx]);
+		part = rcu_dereference(ptbl->part[piter->idx]);
 		if (!part)
 			continue;
 		if (!(piter->flags & DISK_PITER_INCL_EMPTY) && !part->nr_sects)
@@ -180,10 +197,13 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
  */
 struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 {
+	struct disk_part_tbl *ptbl;
 	int i;
 
-	for (i = 1; i < disk_max_parts(disk); i++) {
-		struct hd_struct *part = rcu_dereference(disk->__part[i]);
+	ptbl = rcu_dereference(disk->part_tbl);
+
+	for (i = 1; i < ptbl->len; i++) {
+		struct hd_struct *part = rcu_dereference(ptbl->part[i]);
 
 		if (part && part->start_sect <= sector &&
 		    sector < part->start_sect + part->nr_sects)
@@ -798,12 +818,86 @@ static struct attribute_group *disk_attr_groups[] = {
 	NULL
 };
 
+static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
+{
+	struct disk_part_tbl *ptbl =
+		container_of(head, struct disk_part_tbl, rcu_head);
+
+	kfree(ptbl);
+}
+
+/**
+ * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
+ * @disk: disk to replace part_tbl for
+ * @new_ptbl: new part_tbl to install
+ *
+ * Replace disk->part_tbl with @new_ptbl in RCU-safe way.  The
+ * original ptbl is freed using RCU callback.
+ *
+ * LOCKING:
+ * Matching bd_mutex locked.
+ */
+static void disk_replace_part_tbl(struct gendisk *disk,
+				  struct disk_part_tbl *new_ptbl)
+{
+	struct disk_part_tbl *old_ptbl = disk->part_tbl;
+
+	rcu_assign_pointer(disk->part_tbl, new_ptbl);
+	if (old_ptbl)
+		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
+}
+
+/**
+ * disk_expand_part_tbl - expand disk->part_tbl
+ * @disk: disk to expand part_tbl for
+ * @partno: expand such that this partno can fit in
+ *
+ * Expand disk->part_tbl such that @partno can fit in.  disk->part_tbl
+ * uses RCU to allow unlocked dereferencing for stats and other stuff.
+ *
+ * LOCKING:
+ * Matching bd_mutex locked, might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int disk_expand_part_tbl(struct gendisk *disk, int partno)
+{
+	struct disk_part_tbl *old_ptbl = disk->part_tbl;
+	struct disk_part_tbl *new_ptbl;
+	int len = old_ptbl ? old_ptbl->len : 0;
+	int target = partno + 1;
+	size_t size;
+	int i;
+
+	/* disk_max_parts() is zero during initialization, ignore if so */
+	if (disk_max_parts(disk) && target > disk_max_parts(disk))
+		return -EINVAL;
+
+	if (target <= len)
+		return 0;
+
+	size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
+	new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
+	if (!new_ptbl)
+		return -ENOMEM;
+
+	INIT_RCU_HEAD(&new_ptbl->rcu_head);
+	new_ptbl->len = target;
+
+	for (i = 0; i < len; i++)
+		rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
+
+	disk_replace_part_tbl(disk, new_ptbl);
+	return 0;
+}
+
 static void disk_release(struct device *dev)
 {
 	struct gendisk *disk = dev_to_disk(dev);
 
 	kfree(disk->random);
-	kfree(disk->__part);
+	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
 	kfree(disk);
 }
@@ -948,22 +1042,16 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 	disk = kmalloc_node(sizeof(struct gendisk),
 				GFP_KERNEL | __GFP_ZERO, node_id);
 	if (disk) {
-		int tot_minors = minors + ext_minors;
-		int size = tot_minors * sizeof(struct hd_struct *);
-
 		if (!init_part_stats(&disk->part0)) {
 			kfree(disk);
 			return NULL;
 		}
-
-		disk->__part = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO,
-					    node_id);
-		if (!disk->__part) {
-				free_part_stats(&disk->part0);
+		if (disk_expand_part_tbl(disk, 0)) {
+			free_part_stats(&disk->part0);
 			kfree(disk);
 			return NULL;
 		}
-		disk->__part[0] = &disk->part0;
+		disk->part_tbl->part[0] = &disk->part0;
 
 		disk->minors = minors;
 		disk->ext_minors = ext_minors;
@@ -973,6 +1061,7 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 		device_initialize(disk_to_dev(disk));
 		INIT_WORK(&disk->async_notify,
 			media_change_notify_thread);
+		disk->node_id = node_id;
 	}
 	return disk;
 }
diff --git a/block/ioctl.c b/block/ioctl.c
index 64e7c67a64b0..38bee321e1fa 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -30,7 +30,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	if (bdev != bdev->bd_contains)
 		return -EINVAL;
 	partno = p.pno;
-	if (partno <= 0 || partno >= disk_max_parts(disk))
+	if (partno <= 0)
 		return -EINVAL;
 	switch (a.op) {
 		case BLKPG_ADD_PARTITION:
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f517869e8d10..772b2ed8d239 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,14 +312,18 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
 
 void delete_partition(struct gendisk *disk, int partno)
 {
+	struct disk_part_tbl *ptbl = disk->part_tbl;
 	struct hd_struct *part;
 
-	part = disk->__part[partno];
+	if (partno >= ptbl->len)
+		return;
+
+	part = ptbl->part[partno];
 	if (!part)
 		return;
 
 	blk_free_devt(part_devt(part));
-	rcu_assign_pointer(disk->__part[partno], NULL);
+	rcu_assign_pointer(ptbl->part[partno], NULL);
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
@@ -341,10 +345,16 @@ int add_partition(struct gendisk *disk, int partno,
 	dev_t devt = MKDEV(0, 0);
 	struct device *ddev = disk_to_dev(disk);
 	struct device *pdev;
+	struct disk_part_tbl *ptbl;
 	const char *dname;
 	int err;
 
-	if (disk->__part[partno])
+	err = disk_expand_part_tbl(disk, partno);
+	if (err)
+		return err;
+	ptbl = disk->part_tbl;
+
+	if (ptbl->part[partno])
 		return -EBUSY;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
@@ -398,7 +408,7 @@ int add_partition(struct gendisk *disk, int partno,
 
 	/* everything is up and running, commence */
 	INIT_RCU_HEAD(&p->rcu_head);
-	rcu_assign_pointer(disk->__part[partno], p);
+	rcu_assign_pointer(ptbl->part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
 	if (!ddev->uevent_suppress)
@@ -487,7 +497,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	struct disk_part_iter piter;
 	struct hd_struct *part;
 	struct parsed_partitions *state;
-	int p, res;
+	int p, highest, res;
 
 	if (bdev->bd_part_count)
 		return -EBUSY;
@@ -511,6 +521,17 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	/* tell userspace that the media / partition table may have changed */
 	kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
 
+	/* Detect the highest partition number and preallocate
+	 * disk->part_tbl.  This is an optimization and not strictly
+	 * necessary.
+	 */
+	for (p = 1, highest = 0; p < state->limit; p++)
+		if (state->parts[p].size)
+			highest = p;
+
+	disk_expand_part_tbl(disk, highest);
+
+	/* add partitions */
 	for (p = 1; p < state->limit; p++) {
 		sector_t size = state->parts[p].size;
 		sector_t from = state->parts[p].from;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c90e1b4fbe5a..ecf649c3deed 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -113,6 +113,21 @@ struct hd_struct {
 #define GENHD_FL_UP				16
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	32
 
+#define BLK_SCSI_MAX_CMDS	(256)
+#define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
+
+struct blk_scsi_cmd_filter {
+	unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
+	unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
+	struct kobject kobj;
+};
+
+struct disk_part_tbl {
+	struct rcu_head rcu_head;
+	int len;
+	struct hd_struct *part[];
+};
+
 struct gendisk {
 	/* major, first_minor, minors and ext_minors are input
 	 * parameters only, don't use directly.  Use disk_devt() and
@@ -131,7 +146,7 @@ struct gendisk {
 	 * non-critical accesses use RCU.  Always access through
 	 * helpers.
 	 */
-	struct hd_struct **__part;
+	struct disk_part_tbl *part_tbl;
 	struct hd_struct part0;
 
 	struct block_device_operations *fops;
@@ -149,6 +164,7 @@ struct gendisk {
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity *integrity;
 #endif
+	int node_id;
 };
 
 static inline struct gendisk *part_to_disk(struct hd_struct *part)
@@ -503,6 +519,7 @@ extern void blk_free_devt(dev_t devt);
 extern dev_t blk_lookup_devt(const char *name, int partno);
 extern char *disk_name (struct gendisk *hd, int partno, char *buf);
 
+extern int disk_expand_part_tbl(struct gendisk *disk, int target);
 extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
 extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int);
 extern void delete_partition(struct gendisk *, int);

From 689d6fac40b41c7bf154f362deaf442548e4dc81 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:16 +0900
Subject: [PATCH 053/132] block: replace @ext_minors with GENHD_FL_EXT_DEVT

With previous changes, it's meaningless to limit the number of
partitions.  Replace @ext_minors with GENHD_FL_EXT_DEVT such that
setting the flag allows the disk to have maximum number of allowed
partitions (only limited by the number of entries in parsed_partitions
as determined by MAX_PART constant).

This kills not-too-pretty alloc_disk_ext[_node]() functions and makes
@minors parameter to alloc_disk[_node]() unnecessary.  The parameter
is left alone to avoid disturbing the users.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c          | 16 +---------------
 drivers/ide/ide-disk.c | 14 +++++---------
 drivers/scsi/sd.c      |  9 ++-------
 fs/partitions/check.h  |  4 +---
 include/linux/genhd.h  | 16 ++++++++--------
 5 files changed, 17 insertions(+), 42 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index c2b14aa69d58..eedab5b4685b 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1024,18 +1024,9 @@ struct gendisk *alloc_disk(int minors)
 {
 	return alloc_disk_node(minors, -1);
 }
+EXPORT_SYMBOL(alloc_disk);
 
 struct gendisk *alloc_disk_node(int minors, int node_id)
-{
-	return alloc_disk_ext_node(minors, 0, node_id);
-}
-
-struct gendisk *alloc_disk_ext(int minors, int ext_minors)
-{
-	return alloc_disk_ext_node(minors, ext_minors, -1);
-}
-
-struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 {
 	struct gendisk *disk;
 
@@ -1054,7 +1045,6 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 		disk->part_tbl->part[0] = &disk->part0;
 
 		disk->minors = minors;
-		disk->ext_minors = ext_minors;
 		rand_initialize_disk(disk);
 		disk_to_dev(disk)->class = &block_class;
 		disk_to_dev(disk)->type = &disk_type;
@@ -1065,11 +1055,7 @@ struct gendisk *alloc_disk_ext_node(int minors, int ext_minors, int node_id)
 	}
 	return disk;
 }
-
-EXPORT_SYMBOL(alloc_disk);
 EXPORT_SYMBOL(alloc_disk_node);
-EXPORT_SYMBOL(alloc_disk_ext);
-EXPORT_SYMBOL(alloc_disk_ext_node);
 
 struct kobject *get_disk(struct gendisk *disk)
 {
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index a072df5053ae..29c8ae752683 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -41,16 +41,12 @@
 #include <asm/io.h>
 #include <asm/div64.h>
 
-#define IDE_DISK_PARTS		(1 << PARTN_BITS)
-
 #if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
-#define IDE_DISK_MINORS		IDE_DISK_PARTS
+#define IDE_DISK_MINORS		(1 << PARTN_BITS)
 #else
 #define IDE_DISK_MINORS		1
 #endif
 
-#define IDE_DISK_EXT_MINORS	(IDE_DISK_PARTS - IDE_DISK_MINORS)
-
 struct ide_disk_obj {
 	ide_drive_t	*drive;
 	ide_driver_t	*driver;
@@ -1161,8 +1157,7 @@ static int ide_disk_probe(ide_drive_t *drive)
 	if (!idkp)
 		goto failed;
 
-	g = alloc_disk_ext_node(IDE_DISK_MINORS, IDE_DISK_EXT_MINORS,
-				hwif_to_node(drive->hwif));
+	g = alloc_disk_node(IDE_DISK_MINORS, hwif_to_node(drive->hwif));
 	if (!g)
 		goto out_free_idkp;
 
@@ -1189,9 +1184,10 @@ static int ide_disk_probe(ide_drive_t *drive)
 		drive->attach = 1;
 
 	g->minors = IDE_DISK_MINORS;
-	g->ext_minors = IDE_DISK_EXT_MINORS;
 	g->driverfs_dev = &drive->gendev;
-	g->flags = drive->removable ? GENHD_FL_REMOVABLE : 0;
+	g->flags |= GENHD_FL_EXT_DEVT;
+	if (drive->removable)
+		g->flags |= GENHD_FL_REMOVABLE;
 	set_capacity(g, idedisk_capacity(drive));
 	g->fops = &idedisk_ops;
 	add_disk(g);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 280d231a86ed..6598024531dd 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -86,16 +86,12 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
 
-#define SD_PARTS	64
-
 #if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
 #define SD_MINORS	16
 #else
 #define SD_MINORS	1
 #endif
 
-#define SD_EXT_MINORS	(SD_PARTS - SD_MINORS)
-
 static int  sd_revalidate_disk(struct gendisk *);
 static int  sd_probe(struct device *);
 static int  sd_remove(struct device *);
@@ -1811,7 +1807,7 @@ static int sd_probe(struct device *dev)
 	if (!sdkp)
 		goto out;
 
-	gd = alloc_disk_ext(SD_MINORS, SD_EXT_MINORS);
+	gd = alloc_disk(SD_MINORS);
 	if (!gd)
 		goto out_free;
 
@@ -1856,7 +1852,6 @@ static int sd_probe(struct device *dev)
 	gd->major = sd_major((index & 0xf0) >> 4);
 	gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
 	gd->minors = SD_MINORS;
-	gd->ext_minors = SD_EXT_MINORS;
 	gd->fops = &sd_fops;
 
 	if (index < 26) {
@@ -1880,7 +1875,7 @@ static int sd_probe(struct device *dev)
 	blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);
 
 	gd->driverfs_dev = &sdp->sdev_gendev;
-	gd->flags = GENHD_FL_DRIVERFS;
+	gd->flags = GENHD_FL_EXT_DEVT | GENHD_FL_DRIVERFS;
 	if (sdp->removable)
 		gd->flags |= GENHD_FL_REMOVABLE;
 
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8b..98dbe1a84528 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
  * add_gd_partition adds a partitions details to the devices partition
  * description.
  */
-enum { MAX_PART = 256 };
-
 struct parsed_partitions {
 	char name[BDEVNAME_SIZE];
 	struct {
 		sector_t from;
 		sector_t size;
 		int flags;
-	} parts[MAX_PART];
+	} parts[DISK_MAX_PARTS];
 	int next;
 	int limit;
 };
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index ecf649c3deed..04524c213de1 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -58,6 +58,8 @@ enum {
 	UNIXWARE_PARTITION = 0x63,	/* Same as GNU_HURD and SCO Unix */
 };
 
+#define DISK_MAX_PARTS			256
+
 #include <linux/major.h>
 #include <linux/device.h>
 #include <linux/smp.h>
@@ -112,6 +114,7 @@ struct hd_struct {
 #define GENHD_FL_CD				8
 #define GENHD_FL_UP				16
 #define GENHD_FL_SUPPRESS_PARTITION_INFO	32
+#define GENHD_FL_EXT_DEVT			64 /* allow extended devt */
 
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
@@ -129,15 +132,13 @@ struct disk_part_tbl {
 };
 
 struct gendisk {
-	/* major, first_minor, minors and ext_minors are input
-	 * parameters only, don't use directly.  Use disk_devt() and
-	 * disk_max_parts().
+	/* major, first_minor and minors are input parameters only,
+	 * don't use directly.  Use disk_devt() and disk_max_parts().
 	 */
 	int major;			/* major number of driver */
 	int first_minor;
 	int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
-	int ext_minors;			/* number of extended dynamic minors */
 
 	char disk_name[32];		/* name of major driver */
 
@@ -180,7 +181,9 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
 
 static inline int disk_max_parts(struct gendisk *disk)
 {
-	return disk->minors + disk->ext_minors;
+	if (disk->flags & GENHD_FL_EXT_DEVT)
+		return DISK_MAX_PARTS;
+	return disk->minors;
 }
 
 static inline bool disk_partitionable(struct gendisk *disk)
@@ -527,9 +530,6 @@ extern void printk_all_partitions(void);
 
 extern struct gendisk *alloc_disk_node(int minors, int node_id);
 extern struct gendisk *alloc_disk(int minors);
-extern struct gendisk *alloc_disk_ext_node(int minors, int ext_minrs,
-					   int node_id);
-extern struct gendisk *alloc_disk_ext(int minors, int ext_minors);
 extern struct kobject *get_disk(struct gendisk *disk);
 extern void put_disk(struct gendisk *disk);
 extern void blk_register_region(dev_t devt, unsigned long range,

From 3e1a7ff8a0a7b948f2684930166954f9e8e776fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Aug 2008 19:56:17 +0900
Subject: [PATCH 054/132] block: allow disk to have extended device number

Now that disk and partition handlings are mostly unified, it's easy to
allow disk to have extended device number.  This patch makes
add_disk() use extended device number if disk->minors is zero.  Both
sd and ide-disk are updated to use this.

* sd_format_disk_name() is implemented which can generically determine
  the drive name.  This removes disk number restriction stemming from
  limited device names.

* If sd index goes over SD_MAX_DISKS (which can be increased now BTW),
  sd simply doesn't initialize minors letting block layer choose
  extended device number.

* If CONFIG_DEBUG_BLOCK_EXT_DEVT is set, both sd and ide-disk always set
  minors to 0 and use extended device numbers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c          | 25 +++++++++++++-
 drivers/ide/ide-disk.c |  2 +-
 drivers/scsi/sd.c      | 74 ++++++++++++++++++++++++++++++------------
 fs/partitions/check.c  |  1 +
 include/linux/genhd.h  |  3 +-
 5 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index eedab5b4685b..d9de3e482d1e 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -478,14 +478,37 @@ static int exact_lock(dev_t devt, void *data)
  *
  * This function registers the partitioning information in @disk
  * with the kernel.
+ *
+ * FIXME: error handling
  */
 void add_disk(struct gendisk *disk)
 {
 	struct backing_dev_info *bdi;
+	dev_t devt;
 	int retval;
 
+	/* minors == 0 indicates to use ext devt from part0 and should
+	 * be accompanied with EXT_DEVT flag.  Make sure all
+	 * parameters make sense.
+	 */
+	WARN_ON(disk->minors && !(disk->major || disk->first_minor));
+	WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
+
 	disk->flags |= GENHD_FL_UP;
-	disk_to_dev(disk)->devt = MKDEV(disk->major, disk->first_minor);
+
+	retval = blk_alloc_devt(&disk->part0, &devt);
+	if (retval) {
+		WARN_ON(1);
+		return;
+	}
+	disk_to_dev(disk)->devt = devt;
+
+	/* ->major and ->first_minor aren't supposed to be
+	 * dereferenced from here on, but set them just in case.
+	 */
+	disk->major = MAJOR(devt);
+	disk->first_minor = MINOR(devt);
+
 	blk_register_region(disk_devt(disk), disk->minors, NULL,
 			    exact_match, exact_lock, disk);
 	register_disk(disk);
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 29c8ae752683..33ea8c048717 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -44,7 +44,7 @@
 #if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
 #define IDE_DISK_MINORS		(1 << PARTN_BITS)
 #else
-#define IDE_DISK_MINORS		1
+#define IDE_DISK_MINORS		0
 #endif
 
 struct ide_disk_obj {
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 6598024531dd..bcb04b2a7676 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -89,7 +89,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
 #if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
 #define SD_MINORS	16
 #else
-#define SD_MINORS	1
+#define SD_MINORS	0
 #endif
 
 static int  sd_revalidate_disk(struct gendisk *);
@@ -1769,6 +1769,52 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	return 0;
 }
 
+/**
+ *	sd_format_disk_name - format disk name
+ *	@prefix: name prefix - ie. "sd" for SCSI disks
+ *	@index: index of the disk to format name for
+ *	@buf: output buffer
+ *	@buflen: length of the output buffer
+ *
+ *	SCSI disk names starts at sda.  The 26th device is sdz and the
+ *	27th is sdaa.  The last one for two lettered suffix is sdzz
+ *	which is followed by sdaaa.
+ *
+ *	This is basically 26 base counting with one extra 'nil' entry
+ *	at the beggining from the second digit on and can be
+ *	determined using similar method as 26 base conversion with the
+ *	index shifted -1 after each digit is computed.
+ *
+ *	CONTEXT:
+ *	Don't care.
+ *
+ *	RETURNS:
+ *	0 on success, -errno on failure.
+ */
+static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen)
+{
+	const int base = 'z' - 'a' + 1;
+	char *begin = buf + strlen(prefix);
+	char *end = buf + buflen;
+	char *p;
+	int unit;
+
+	p = end - 1;
+	*p = '\0';
+	unit = base;
+	do {
+		if (p == begin)
+			return -EINVAL;
+		*--p = 'a' + (index % unit);
+		index = (index / unit) - 1;
+	} while (index >= 0);
+
+	memmove(begin, p, end - p);
+	memcpy(buf, prefix, strlen(prefix));
+
+	return 0;
+}
+
 /**
  *	sd_probe - called during driver initialization and whenever a
  *	new scsi device is attached to the system. It is called once
@@ -1821,8 +1867,8 @@ static int sd_probe(struct device *dev)
 	if (error)
 		goto out_put;
 
-	error = -EBUSY;
-	if (index >= SD_MAX_DISKS)
+	error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
+	if (error)
 		goto out_free_index;
 
 	sdkp->device = sdp;
@@ -1849,24 +1895,12 @@ static int sd_probe(struct device *dev)
 
 	get_device(&sdp->sdev_gendev);
 
-	gd->major = sd_major((index & 0xf0) >> 4);
-	gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
-	gd->minors = SD_MINORS;
-	gd->fops = &sd_fops;
-
-	if (index < 26) {
-		sprintf(gd->disk_name, "sd%c", 'a' + index % 26);
-	} else if (index < (26 + 1) * 26) {
-		sprintf(gd->disk_name, "sd%c%c",
-			'a' + index / 26 - 1,'a' + index % 26);
-	} else {
-		const unsigned int m1 = (index / 26 - 1) / 26 - 1;
-		const unsigned int m2 = (index / 26 - 1) % 26;
-		const unsigned int m3 =  index % 26;
-		sprintf(gd->disk_name, "sd%c%c%c",
-			'a' + m1, 'a' + m2, 'a' + m3);
+	if (index < SD_MAX_DISKS) {
+		gd->major = sd_major((index & 0xf0) >> 4);
+		gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
+		gd->minors = SD_MINORS;
 	}
-
+	gd->fops = &sd_fops;
 	gd->private_data = &sdkp->driver;
 	gd->queue = sdkp->device->request_queue;
 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 772b2ed8d239..0e411603fdf5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -593,6 +593,7 @@ void del_gendisk(struct gendisk *disk)
 	disk_part_iter_exit(&piter);
 
 	invalidate_partition(disk, 0);
+	blk_free_devt(disk_to_dev(disk)->devt);
 	set_capacity(disk, 0);
 	disk->flags &= ~GENHD_FL_UP;
 	unlink_gendisk(disk);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 04524c213de1..206cdf96c3a7 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -59,6 +59,7 @@ enum {
 };
 
 #define DISK_MAX_PARTS			256
+#define DISK_NAME_LEN			32
 
 #include <linux/major.h>
 #include <linux/device.h>
@@ -140,7 +141,7 @@ struct gendisk {
 	int minors;                     /* maximum number of minors, =1 for
                                          * disks that can't be partitioned. */
 
-	char disk_name[32];		/* name of major driver */
+	char disk_name[DISK_NAME_LEN];	/* name of major driver */
 
 	/* Array of pointers to partitions indexed by partno.
 	 * Protected with matching bdev lock but stat and other

From 0835da67c11e879ed5dc23160934d8970470a2ce Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 26 Aug 2008 09:15:47 +0200
Subject: [PATCH 055/132] block: use linux/uaccess.h in elevator.c instead of
 asm variant

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/elevator.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 269615e6dbf5..8e3fc3afc77b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -34,8 +34,7 @@
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
 #include <linux/hash.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);

From b646fc59b332ef307895558c9cd1359dc2d25813 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 28 Jul 2008 13:06:00 +0200
Subject: [PATCH 056/132] block: split softirq handling into blk-softirq.c

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Makefile      |   4 +-
 block/blk-core.c    |  88 -------------------------------------
 block/blk-softirq.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 90 deletions(-)
 create mode 100644 block/blk-softirq.c

diff --git a/block/Makefile b/block/Makefile
index 208000b0750d..0da976ce67dd 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -4,8 +4,8 @@
 
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
-			blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \
-			cmd-filter.o
+			blk-exec.o blk-merge.o blk-softirq.o ioctl.o genhd.o \
+			scsi_ioctl.o cmd-filter.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 98138f002524..527b3382a610 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -26,8 +26,6 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
@@ -50,8 +48,6 @@ struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
 static void drive_stat_acct(struct request *rq, int new_io)
 {
 	struct hd_struct *part;
@@ -1643,82 +1639,6 @@ static int __end_that_request_first(struct request *req, int error,
 	return 1;
 }
 
-/*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
-	struct list_head *cpu_list, local_list;
-
-	local_irq_disable();
-	cpu_list = &__get_cpu_var(blk_cpu_done);
-	list_replace_init(cpu_list, &local_list);
-	local_irq_enable();
-
-	while (!list_empty(&local_list)) {
-		struct request *rq;
-
-		rq = list_entry(local_list.next, struct request, donelist);
-		list_del_init(&rq->donelist);
-		rq->q->softirq_done_fn(rq);
-	}
-}
-
-static int __cpuinit blk_cpu_notify(struct notifier_block *self,
-				    unsigned long action, void *hcpu)
-{
-	/*
-	 * If a CPU goes away, splice its entries to the current CPU
-	 * and trigger a run of the softirq
-	 */
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		int cpu = (unsigned long) hcpu;
-
-		local_irq_disable();
-		list_splice_init(&per_cpu(blk_cpu_done, cpu),
-				 &__get_cpu_var(blk_cpu_done));
-		raise_softirq_irqoff(BLOCK_SOFTIRQ);
-		local_irq_enable();
-	}
-
-	return NOTIFY_OK;
-}
-
-
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
-	.notifier_call	= blk_cpu_notify,
-};
-
-/**
- * blk_complete_request - end I/O on a request
- * @req:      the request being processed
- *
- * Description:
- *     Ends all I/O on a request. It does not handle partial completions,
- *     unless the driver actually implements this in its completion callback
- *     through requeueing. The actual completion happens out-of-order,
- *     through a softirq handler. The user must have registered a completion
- *     callback through blk_queue_softirq_done().
- **/
-
-void blk_complete_request(struct request *req)
-{
-	struct list_head *cpu_list;
-	unsigned long flags;
-
-	BUG_ON(!req->q->softirq_done_fn);
-
-	local_irq_save(flags);
-
-	cpu_list = &__get_cpu_var(blk_cpu_done);
-	list_add_tail(&req->donelist, cpu_list);
-	raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_complete_request);
-
 /*
  * queue lock must be held
  */
@@ -2053,8 +1973,6 @@ EXPORT_SYMBOL(kblockd_flush_work);
 
 int __init blk_dev_init(void)
 {
-	int i;
-
 	kblockd_workqueue = create_workqueue("kblockd");
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
@@ -2065,12 +1983,6 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("blkdev_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
-	for_each_possible_cpu(i)
-		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
-	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
-	register_hotcpu_notifier(&blk_cpu_notifier);
-
 	return 0;
 }
 
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
new file mode 100644
index 000000000000..9e1c43bff662
--- /dev/null
+++ b/block/blk-softirq.c
@@ -0,0 +1,103 @@
+/*
+ * Functions related to softirq rq completions
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+
+#include "blk.h"
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
+static int __cpuinit blk_cpu_notify(struct notifier_block *self,
+				    unsigned long action, void *hcpu)
+{
+	/*
+	 * If a CPU goes away, splice its entries to the current CPU
+	 * and trigger a run of the softirq
+	 */
+	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		int cpu = (unsigned long) hcpu;
+
+		local_irq_disable();
+		list_splice_init(&per_cpu(blk_cpu_done, cpu),
+				 &__get_cpu_var(blk_cpu_done));
+		raise_softirq_irqoff(BLOCK_SOFTIRQ);
+		local_irq_enable();
+	}
+
+	return NOTIFY_OK;
+}
+
+
+static struct notifier_block blk_cpu_notifier __cpuinitdata = {
+	.notifier_call	= blk_cpu_notify,
+};
+
+/*
+ * splice the completion data to a local structure and hand off to
+ * process_completion_queue() to complete the requests
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+	struct list_head *cpu_list, local_list;
+
+	local_irq_disable();
+	cpu_list = &__get_cpu_var(blk_cpu_done);
+	list_replace_init(cpu_list, &local_list);
+	local_irq_enable();
+
+	while (!list_empty(&local_list)) {
+		struct request *rq;
+
+		rq = list_entry(local_list.next, struct request, donelist);
+		list_del_init(&rq->donelist);
+		rq->q->softirq_done_fn(rq);
+	}
+}
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req:      the request being processed
+ *
+ * Description:
+ *     Ends all I/O on a request. It does not handle partial completions,
+ *     unless the driver actually implements this in its completion callback
+ *     through requeueing. The actual completion happens out-of-order,
+ *     through a softirq handler. The user must have registered a completion
+ *     callback through blk_queue_softirq_done().
+ **/
+
+void blk_complete_request(struct request *req)
+{
+	struct list_head *cpu_list;
+	unsigned long flags;
+
+	BUG_ON(!req->q->softirq_done_fn);
+
+	local_irq_save(flags);
+
+	cpu_list = &__get_cpu_var(blk_cpu_done);
+	list_add_tail(&req->donelist, cpu_list);
+	raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL(blk_complete_request);
+
+int __init blk_softirq_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+
+	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+	register_hotcpu_notifier(&blk_cpu_notifier);
+	return 0;
+}
+subsys_initcall(blk_softirq_init);

From 18887ad910e56066233a07fd3cfb2fa11338b782 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 28 Jul 2008 13:08:45 +0200
Subject: [PATCH 057/132] block: make kblockd_schedule_work() take the queue as
 parameter

Preparatory patch for checking queuing affinity.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c     | 6 +++---
 block/blk-core.c       | 8 ++++----
 block/cfq-iosched.c    | 2 +-
 include/linux/blkdev.h | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/block/as-iosched.c b/block/as-iosched.c
index cf4eb0eefbbf..80af9257e64a 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -462,7 +462,7 @@ static void as_antic_stop(struct as_data *ad)
 			del_timer(&ad->antic_timer);
 		ad->antic_status = ANTIC_FINISHED;
 		/* see as_work_handler */
-		kblockd_schedule_work(&ad->antic_work);
+		kblockd_schedule_work(ad->q, &ad->antic_work);
 	}
 }
 
@@ -483,7 +483,7 @@ static void as_antic_timeout(unsigned long data)
 		aic = ad->io_context->aic;
 
 		ad->antic_status = ANTIC_FINISHED;
-		kblockd_schedule_work(&ad->antic_work);
+		kblockd_schedule_work(q, &ad->antic_work);
 
 		if (aic->ttime_samples == 0) {
 			/* process anticipated on has exited or timed out*/
@@ -844,7 +844,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
 	if (ad->changed_batch && ad->nr_dispatched == 1) {
 		ad->current_batch_expires = jiffies +
 					ad->batch_expire[ad->batch_data_dir];
-		kblockd_schedule_work(&ad->antic_work);
+		kblockd_schedule_work(q, &ad->antic_work);
 		ad->changed_batch = 0;
 
 		if (ad->batch_data_dir == REQ_SYNC)
diff --git a/block/blk-core.c b/block/blk-core.c
index 527b3382a610..9c6f818d0c33 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -305,7 +305,7 @@ void blk_unplug_timeout(unsigned long data)
 	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
-	kblockd_schedule_work(&q->unplug_work);
+	kblockd_schedule_work(q, &q->unplug_work);
 }
 
 void blk_unplug(struct request_queue *q)
@@ -346,7 +346,7 @@ void blk_start_queue(struct request_queue *q)
 		queue_flag_clear(QUEUE_FLAG_REENTER, q);
 	} else {
 		blk_plug_device(q);
-		kblockd_schedule_work(&q->unplug_work);
+		kblockd_schedule_work(q, &q->unplug_work);
 	}
 }
 EXPORT_SYMBOL(blk_start_queue);
@@ -411,7 +411,7 @@ void __blk_run_queue(struct request_queue *q)
 			queue_flag_clear(QUEUE_FLAG_REENTER, q);
 		} else {
 			blk_plug_device(q);
-			kblockd_schedule_work(&q->unplug_work);
+			kblockd_schedule_work(q, &q->unplug_work);
 		}
 	}
 }
@@ -1959,7 +1959,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
-int kblockd_schedule_work(struct work_struct *work)
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
 }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1e2aff812ee2..5f6fd287c185 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -244,7 +244,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 {
 	if (cfqd->busy_queues) {
 		cfq_log(cfqd, "schedule dispatch");
-		kblockd_schedule_work(&cfqd->unplug_work);
+		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
 	}
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1adb03827bd3..10aa46c8f170 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -912,7 +912,7 @@ static inline void put_dev_sector(Sector p)
 }
 
 struct work_struct;
-int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 void kblockd_flush_work(struct work_struct *work);
 
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \

From c7c22e4d5c1fdebfac4dba76de7d0338c2b0d832 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sat, 13 Sep 2008 20:26:01 +0200
Subject: [PATCH 058/132] block: add support for IO CPU affinity

This patch adds support for controlling the IO completion CPU of
either all requests on a queue, or on a per-request basis. We export
a sysfs variable (rq_affinity) which, if set, migrates completions
of requests to the CPU that originally submitted it. A bio helper
(bio_set_completion_cpu()) is also added, so that queuers can ask
for completion on that specific CPU.

In testing, this has been shown to cut the system time by as much
as 20-40% on synthetic workloads where CPU affinity is desired.

This requires a little help from the architecture, so it'll only
work as designed for archs that are using the new generic smp
helper infrastructure.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c         |  46 +++++++-------
 block/blk-settings.c     |   2 +-
 block/blk-softirq.c      | 126 +++++++++++++++++++++++++++++----------
 block/blk-sysfs.c        |  31 ++++++++++
 block/blk.h              |  12 ++++
 fs/bio.c                 |   1 +
 include/linux/bio.h      |  11 ++++
 include/linux/blkdev.h   |   5 +-
 include/linux/elevator.h |   8 +--
 9 files changed, 182 insertions(+), 60 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 9c6f818d0c33..5484838f46e7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -110,7 +110,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	memset(rq, 0, sizeof(*rq));
 
 	INIT_LIST_HEAD(&rq->queuelist);
-	INIT_LIST_HEAD(&rq->donelist);
+	rq->cpu = -1;
 	rq->q = q;
 	rq->sector = rq->hard_sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
@@ -322,6 +322,21 @@ void blk_unplug(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_unplug);
 
+static void blk_invoke_request_fn(struct request_queue *q)
+{
+	/*
+	 * one level of recursion is ok and is much faster than kicking
+	 * the unplug handling
+	 */
+	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+		q->request_fn(q);
+		queue_flag_clear(QUEUE_FLAG_REENTER, q);
+	} else {
+		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
+		kblockd_schedule_work(q, &q->unplug_work);
+	}
+}
+
 /**
  * blk_start_queue - restart a previously stopped queue
  * @q:    The &struct request_queue in question
@@ -336,18 +351,7 @@ void blk_start_queue(struct request_queue *q)
 	WARN_ON(!irqs_disabled());
 
 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-
-	/*
-	 * one level of recursion is ok and is much faster than kicking
-	 * the unplug handling
-	 */
-	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-		q->request_fn(q);
-		queue_flag_clear(QUEUE_FLAG_REENTER, q);
-	} else {
-		blk_plug_device(q);
-		kblockd_schedule_work(q, &q->unplug_work);
-	}
+	blk_invoke_request_fn(q);
 }
 EXPORT_SYMBOL(blk_start_queue);
 
@@ -405,15 +409,8 @@ void __blk_run_queue(struct request_queue *q)
 	 * Only recurse once to avoid overrunning the stack, let the unplug
 	 * handling reinvoke the handler shortly if we already got there.
 	 */
-	if (!elv_queue_empty(q)) {
-		if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-			q->request_fn(q);
-			queue_flag_clear(QUEUE_FLAG_REENTER, q);
-		} else {
-			blk_plug_device(q);
-			kblockd_schedule_work(q, &q->unplug_work);
-		}
-	}
+	if (!elv_queue_empty(q))
+		blk_invoke_request_fn(q);
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
@@ -1056,6 +1053,7 @@ EXPORT_SYMBOL(blk_put_request);
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
+	req->cpu = bio->bi_comp_cpu;
 	req->cmd_type = REQ_TYPE_FS;
 
 	/*
@@ -1198,13 +1196,15 @@ get_rq:
 	init_request_from_bio(req, bio);
 
 	spin_lock_irq(q->queue_lock);
+	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
+	    bio_flagged(bio, BIO_CPU_AFFINE))
+		req->cpu = blk_cpu_to_group(smp_processor_id());
 	if (elv_queue_empty(q))
 		blk_plug_device(q);
 	add_request(q, req);
 out:
 	if (sync)
 		__generic_unplug_device(q);
-
 	spin_unlock_irq(q->queue_lock);
 	return 0;
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d70692badcdb..a60e959a12c4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -443,7 +443,7 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
-static int __init blk_settings_init(void)
+int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
 	blk_max_pfn = max_pfn - 1;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 9e1c43bff662..3a1af551191e 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -13,6 +13,70 @@
 
 static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
 
+/*
+ * Softirq action handler - move entries to local list and loop over them
+ * while passing them to the queue registered handler.
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+	struct list_head *cpu_list, local_list;
+
+	local_irq_disable();
+	cpu_list = &__get_cpu_var(blk_cpu_done);
+	list_replace_init(cpu_list, &local_list);
+	local_irq_enable();
+
+	while (!list_empty(&local_list)) {
+		struct request *rq;
+
+		rq = list_entry(local_list.next, struct request, csd.list);
+		list_del_init(&rq->csd.list);
+		rq->q->softirq_done_fn(rq);
+	}
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+static void trigger_softirq(void *data)
+{
+	struct request *rq = data;
+	unsigned long flags;
+	struct list_head *list;
+
+	local_irq_save(flags);
+	list = &__get_cpu_var(blk_cpu_done);
+	list_add_tail(&rq->csd.list, list);
+
+	if (list->next == &rq->csd.list)
+		raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+	local_irq_restore(flags);
+}
+
+/*
+ * Setup and invoke a run of 'trigger_softirq' on the given cpu.
+ */
+static int raise_blk_irq(int cpu, struct request *rq)
+{
+	if (cpu_online(cpu)) {
+		struct call_single_data *data = &rq->csd;
+
+		data->func = trigger_softirq;
+		data->info = rq;
+		data->flags = 0;
+
+		__smp_call_function_single(cpu, data);
+		return 0;
+	}
+
+	return 1;
+}
+#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+static int raise_blk_irq(int cpu, struct request *rq)
+{
+	return 1;
+}
+#endif
+
 static int __cpuinit blk_cpu_notify(struct notifier_block *self,
 				    unsigned long action, void *hcpu)
 {
@@ -33,33 +97,10 @@ static int __cpuinit blk_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
+static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 	.notifier_call	= blk_cpu_notify,
 };
 
-/*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
-	struct list_head *cpu_list, local_list;
-
-	local_irq_disable();
-	cpu_list = &__get_cpu_var(blk_cpu_done);
-	list_replace_init(cpu_list, &local_list);
-	local_irq_enable();
-
-	while (!list_empty(&local_list)) {
-		struct request *rq;
-
-		rq = list_entry(local_list.next, struct request, donelist);
-		list_del_init(&rq->donelist);
-		rq->q->softirq_done_fn(rq);
-	}
-}
-
 /**
  * blk_complete_request - end I/O on a request
  * @req:      the request being processed
@@ -71,25 +112,48 @@ static void blk_done_softirq(struct softirq_action *h)
  *     through a softirq handler. The user must have registered a completion
  *     callback through blk_queue_softirq_done().
  **/
-
 void blk_complete_request(struct request *req)
 {
-	struct list_head *cpu_list;
+	struct request_queue *q = req->q;
 	unsigned long flags;
+	int ccpu, cpu, group_cpu;
 
-	BUG_ON(!req->q->softirq_done_fn);
+	BUG_ON(!q->softirq_done_fn);
 
 	local_irq_save(flags);
+	cpu = smp_processor_id();
+	group_cpu = blk_cpu_to_group(cpu);
 
-	cpu_list = &__get_cpu_var(blk_cpu_done);
-	list_add_tail(&req->donelist, cpu_list);
-	raise_softirq_irqoff(BLOCK_SOFTIRQ);
+	/*
+	 * Select completion CPU
+	 */
+	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+		ccpu = req->cpu;
+	else
+		ccpu = cpu;
+
+	if (ccpu == cpu || ccpu == group_cpu) {
+		struct list_head *list;
+do_local:
+		list = &__get_cpu_var(blk_cpu_done);
+		list_add_tail(&req->csd.list, list);
+
+		/*
+		 * if the list only contains our just added request,
+		 * signal a raise of the softirq. If there are already
+		 * entries there, someone already raised the irq but it
+		 * hasn't run yet.
+		 */
+		if (list->next == &req->csd.list)
+			raise_softirq_irqoff(BLOCK_SOFTIRQ);
+	} else if (raise_blk_irq(ccpu, req))
+		goto do_local;
 
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(blk_complete_request);
 
-int __init blk_softirq_init(void)
+__init int blk_softirq_init(void)
 {
 	int i;
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b9a6ed166649..21e275d7eed9 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -156,6 +156,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 	return ret;
 }
 
+static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
+{
+	unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+
+	return queue_var_show(set != 0, page);
+}
+
+static ssize_t
+queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
+{
+	ssize_t ret = -EINVAL;
+#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+	unsigned long val;
+
+	ret = queue_var_store(&val, page, count);
+	spin_lock_irq(q->queue_lock);
+	if (val)
+		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+	else
+		queue_flag_clear(QUEUE_FLAG_SAME_COMP,  q);
+	spin_unlock_irq(q->queue_lock);
+#endif
+	return ret;
+}
 
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -197,6 +221,12 @@ static struct queue_sysfs_entry queue_nomerges_entry = {
 	.store = queue_nomerges_store,
 };
 
+static struct queue_sysfs_entry queue_rq_affinity_entry = {
+	.attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_rq_affinity_show,
+	.store = queue_rq_affinity_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -205,6 +235,7 @@ static struct attribute *default_attrs[] = {
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
 	&queue_nomerges_entry.attr,
+	&queue_rq_affinity_entry.attr,
 	NULL,
 };
 
diff --git a/block/blk.h b/block/blk.h
index c79f30e1df52..de74254cb916 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -59,4 +59,16 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 
 #endif /* BLK_DEV_INTEGRITY */
 
+static inline int blk_cpu_to_group(int cpu)
+{
+#ifdef CONFIG_SCHED_MC
+	cpumask_t mask = cpu_coregroup_map(cpu);
+	return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
+	return first_cpu(per_cpu(cpu_sibling_map, cpu));
+#else
+	return cpu;
+#endif
+}
+
 #endif
diff --git a/fs/bio.c b/fs/bio.c
index bee4deca774a..6a637b5c24b5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -111,6 +111,7 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
+	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2c0c09034fd2..13aba20edb2d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -81,6 +81,8 @@ struct bio {
 
 	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */
 
+	unsigned int		bi_comp_cpu;	/* completion CPU */
+
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
 
 	bio_end_io_t		*bi_end_io;
@@ -105,6 +107,7 @@ struct bio {
 #define BIO_BOUNCED	5	/* bio is a bounce bio */
 #define BIO_USER_MAPPED 6	/* contains user pages */
 #define BIO_EOPNOTSUPP	7	/* not supported */
+#define BIO_CPU_AFFINE	8	/* complete bio on same CPU as submitted */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
@@ -342,6 +345,14 @@ void zero_fill_bio(struct bio *bio);
 extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
+/*
+ * Allow queuer to specify a completion CPU for this bio
+ */
+static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
+{
+	bio->bi_comp_cpu = cpu;
+}
+
 /*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 10aa46c8f170..93204bf7b297 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include <linux/bsg.h>
+#include <linux/smp.h>
 
 #include <asm/scatterlist.h>
 
@@ -139,7 +140,8 @@ enum rq_flag_bits {
  */
 struct request {
 	struct list_head queuelist;
-	struct list_head donelist;
+	struct call_single_data csd;
+	int cpu;
 
 	struct request_queue *q;
 
@@ -420,6 +422,7 @@ struct request_queue
 #define QUEUE_FLAG_ELVSWITCH	8	/* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_BIDI		9	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES    10	/* disable merge attempts */
+#define QUEUE_FLAG_SAME_COMP   11	/* force complete on same CPU */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 639624b55fbe..bb791c311a56 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -173,15 +173,15 @@ enum {
 #define rb_entry_rq(node)	rb_entry((node), struct request, rb_node)
 
 /*
- * Hack to reuse the donelist list_head as the fifo time holder while
+ * Hack to reuse the csd.list list_head as the fifo time holder while
  * the request is in the io scheduler. Saves an unsigned long in rq.
  */
-#define rq_fifo_time(rq)	((unsigned long) (rq)->donelist.next)
-#define rq_set_fifo_time(rq,exp)	((rq)->donelist.next = (void *) (exp))
+#define rq_fifo_time(rq)	((unsigned long) (rq)->csd.list.next)
+#define rq_set_fifo_time(rq,exp)	((rq)->csd.list.next = (void *) (exp))
 #define rq_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
 #define rq_fifo_clear(rq)	do {		\
 	list_del_init(&(rq)->queuelist);	\
-	INIT_LIST_HEAD(&(rq)->donelist);	\
+	INIT_LIST_HEAD(&(rq)->csd.list);	\
 	} while (0)
 
 /*

From ab780f1ece0dc8d5e8e8e85435acc5e4747ccda3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 26 Aug 2008 10:25:02 +0200
Subject: [PATCH 059/132] block: inherit CPU completion on bio->rq and rq->rq
 merges

Somewhat incomplete, as we do allow merges of requests and bios
that have different completion CPUs given. This is done on the
assumption that a larger IO is still more beneficial than CPU
locality.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 4 ++++
 block/blk-merge.c      | 2 ++
 include/linux/blkdev.h | 1 +
 3 files changed, 7 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5484838f46e7..b9a252cae4df 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1134,6 +1134,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		req->biotail = bio;
 		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
 		req->ioprio = ioprio_best(req->ioprio, prio);
+		if (!blk_rq_cpu_valid(req))
+			req->cpu = bio->bi_comp_cpu;
 		drive_stat_acct(req, 0);
 		if (!attempt_back_merge(q, req))
 			elv_merged_request(q, req, el_ret);
@@ -1161,6 +1163,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		req->sector = req->hard_sector = bio->bi_sector;
 		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
 		req->ioprio = ioprio_best(req->ioprio, prio);
+		if (!blk_rq_cpu_valid(req))
+			req->cpu = bio->bi_comp_cpu;
 		drive_stat_acct(req, 0);
 		if (!attempt_front_merge(q, req))
 			elv_merged_request(q, req, el_ret);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c77196d55899..908d3e11ac52 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -400,6 +400,8 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	}
 
 	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
+	if (blk_rq_cpu_valid(next))
+		req->cpu = next->cpu;
 
 	__blk_put_request(q, next);
 	return 1;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 93204bf7b297..12df8efeef19 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -545,6 +545,7 @@ enum {
 #define blk_pm_request(rq)	\
 	(blk_pm_suspend_request(rq) || blk_pm_resume_request(rq))
 
+#define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
 #define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
 #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)

From 605401618ce4409045bc4db86e88d4b38f2ad585 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 26 Aug 2008 13:34:34 +0200
Subject: [PATCH 060/132] block: don't use bio_has_data() in the completion
 path

We should just check for rq->bio, as that is really the information
we are looking for. Even if the bio attached doesn't carry data,
we still need to do IO post processing on it.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b9a252cae4df..5bf806adc770 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1817,7 +1817,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (bio_has_data(rq->bio) || blk_discard_rq(rq)) {
+	if (rq->bio) {
 		if (__end_that_request_first(rq, error, nr_bytes))
 			return 1;
 
@@ -1875,8 +1875,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	if ((bio_has_data(rq->bio) || blk_discard_rq(rq)) &&
-	    __end_that_request_first(rq, error, nr_bytes))
+	if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
 		return 1;
 
 	add_disk_randomness(rq->rq_disk);

From 45333d5a31296d0af886d94f1d08f128231cab8e Mon Sep 17 00:00:00 2001
From: Aaron Carroll <aaronc@gelato.unsw.edu.au>
Date: Tue, 26 Aug 2008 15:52:36 +0200
Subject: [PATCH 061/132] cfq-iosched: fix queue depth detection

CFQ's detection of queueing devices assumes a non-queuing device and detects
if the queue depth reaches a certain threshold.  Under some workloads (e.g.
synchronous reads), CFQ effectively forces a unit queue depth, thus defeating
the detection logic.  This leads to poor performance on queuing hardware,
since the idle window remains enabled.

This patch inverts the sense of the logic: assume a queuing-capable device,
and detect if the depth does not exceed the threshold.

Signed-off-by: Aaron Carroll <aaronc@gelato.unsw.edu.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 47 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5f6fd287c185..494b6fdcb183 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -39,6 +39,7 @@ static int cfq_slice_idle = HZ / 125;
 #define CFQ_MIN_TT		(2)
 
 #define CFQ_SLICE_SCALE		(5)
+#define CFQ_HW_QUEUE_MIN	(5)
 
 #define RQ_CIC(rq)		\
 	((struct cfq_io_context *) (rq)->elevator_private)
@@ -86,7 +87,14 @@ struct cfq_data {
 
 	int rq_in_driver;
 	int sync_flight;
+
+	/*
+	 * queue-depth detection
+	 */
+	int rq_queued;
 	int hw_tag;
+	int hw_tag_samples;
+	int rq_in_driver_peak;
 
 	/*
 	 * idle window management
@@ -654,15 +662,6 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 						cfqd->rq_in_driver);
 
-	/*
-	 * If the depth is larger 1, it really could be queueing. But lets
-	 * make the mark a little higher - idling could still be good for
-	 * low queueing, and a low queueing number could also just indicate
-	 * a SCSI mid layer like behaviour where limit+1 is often seen.
-	 */
-	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
-		cfqd->hw_tag = 1;
-
 	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
 }
 
@@ -686,6 +685,7 @@ static void cfq_remove_request(struct request *rq)
 	list_del_init(&rq->queuelist);
 	cfq_del_rq_rb(rq);
 
+	cfqq->cfqd->rq_queued--;
 	if (rq_is_meta(rq)) {
 		WARN_ON(!cfqq->meta_pending);
 		cfqq->meta_pending--;
@@ -1833,6 +1833,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 {
 	struct cfq_io_context *cic = RQ_CIC(rq);
 
+	cfqd->rq_queued++;
 	if (rq_is_meta(rq))
 		cfqq->meta_pending++;
 
@@ -1880,6 +1881,31 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
+/*
+ * Update hw_tag based on peak queue depth over 50 samples under
+ * sufficient load.
+ */
+static void cfq_update_hw_tag(struct cfq_data *cfqd)
+{
+	if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
+		cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
+
+	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
+	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
+		return;
+
+	if (cfqd->hw_tag_samples++ < 50)
+		return;
+
+	if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
+		cfqd->hw_tag = 1;
+	else
+		cfqd->hw_tag = 0;
+
+	cfqd->hw_tag_samples = 0;
+	cfqd->rq_in_driver_peak = 0;
+}
+
 static void cfq_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
@@ -1890,6 +1916,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
 	now = jiffies;
 	cfq_log_cfqq(cfqd, cfqq, "complete");
 
+	cfq_update_hw_tag(cfqd);
+
 	WARN_ON(!cfqd->rq_in_driver);
 	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
@@ -2200,6 +2228,7 @@ static void *cfq_init_queue(struct request_queue *q)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
+	cfqd->hw_tag = 1;
 
 	return cfqd;
 }

From a3bce90edd8f6cafe3f63b1a943800792e830178 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 16:17:05 +0900
Subject: [PATCH 062/132] block: add gfp_mask argument to blk_rq_map_user and
 blk_rq_map_user_iov

Currently, blk_rq_map_user and blk_rq_map_user_iov always do
GFP_KERNEL allocation.

This adds a gfp_mask argument to blk_rq_map_user and blk_rq_map_user_iov
so sg can use them (sg always does GFP_ATOMIC allocation).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Douglas Gilbert <dougg@torque.net>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c             | 20 ++++++++++++--------
 block/bsg.c                 |  5 +++--
 block/scsi_ioctl.c          |  5 +++--
 drivers/cdrom/cdrom.c       |  2 +-
 drivers/scsi/scsi_tgt_lib.c |  2 +-
 fs/bio.c                    | 33 +++++++++++++++++++--------------
 include/linux/bio.h         |  9 +++++----
 include/linux/blkdev.h      |  5 +++--
 8 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index ea1bf53929e4..ac21b7397e15 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -41,7 +41,8 @@ static int __blk_rq_unmap_user(struct bio *bio)
 }
 
 static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
-			     void __user *ubuf, unsigned int len)
+			     void __user *ubuf, unsigned int len,
+			     gfp_t gfp_mask)
 {
 	unsigned long uaddr;
 	unsigned int alignment;
@@ -57,9 +58,9 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 	uaddr = (unsigned long) ubuf;
 	alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 	if (!(uaddr & alignment) && !(len & alignment))
-		bio = bio_map_user(q, NULL, uaddr, len, reading);
+		bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
 	else
-		bio = bio_copy_user(q, uaddr, len, reading);
+		bio = bio_copy_user(q, uaddr, len, reading, gfp_mask);
 
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
@@ -90,6 +91,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
  * @rq:		request structure to fill
  * @ubuf:	the user buffer
  * @len:	length of user data
+ * @gfp_mask:	memory allocation flags
  *
  * Description:
  *    Data will be mapped directly for zero copy I/O, if possible. Otherwise
@@ -105,7 +107,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
  *    unmapping.
  */
 int blk_rq_map_user(struct request_queue *q, struct request *rq,
-		    void __user *ubuf, unsigned long len)
+		    void __user *ubuf, unsigned long len, gfp_t gfp_mask)
 {
 	unsigned long bytes_read = 0;
 	struct bio *bio = NULL;
@@ -132,7 +134,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 		if (end - start > BIO_MAX_PAGES)
 			map_len -= PAGE_SIZE;
 
-		ret = __blk_rq_map_user(q, rq, ubuf, map_len);
+		ret = __blk_rq_map_user(q, rq, ubuf, map_len, gfp_mask);
 		if (ret < 0)
 			goto unmap_rq;
 		if (!bio)
@@ -160,6 +162,7 @@ EXPORT_SYMBOL(blk_rq_map_user);
  * @iov:	pointer to the iovec
  * @iov_count:	number of elements in the iovec
  * @len:	I/O byte count
+ * @gfp_mask:	memory allocation flags
  *
  * Description:
  *    Data will be mapped directly for zero copy I/O, if possible. Otherwise
@@ -175,7 +178,8 @@ EXPORT_SYMBOL(blk_rq_map_user);
  *    unmapping.
  */
 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
-			struct sg_iovec *iov, int iov_count, unsigned int len)
+			struct sg_iovec *iov, int iov_count, unsigned int len,
+			gfp_t gfp_mask)
 {
 	struct bio *bio;
 	int i, read = rq_data_dir(rq) == READ;
@@ -194,9 +198,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	}
 
 	if (unaligned || (q->dma_pad_mask & len))
-		bio = bio_copy_user_iov(q, iov, iov_count, read);
+		bio = bio_copy_user_iov(q, iov, iov_count, read, gfp_mask);
 	else
-		bio = bio_map_user_iov(q, NULL, iov, iov_count, read);
+		bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask);
 
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
diff --git a/block/bsg.c b/block/bsg.c
index 0aae8d7ba99c..e7a142e9916c 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -283,7 +283,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
 		next_rq->cmd_type = rq->cmd_type;
 
 		dxferp = (void*)(unsigned long)hdr->din_xferp;
-		ret =  blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len);
+		ret =  blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len,
+				       GFP_KERNEL);
 		if (ret)
 			goto out;
 	}
@@ -298,7 +299,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
 		dxfer_len = 0;
 
 	if (dxfer_len) {
-		ret = blk_rq_map_user(q, rq, dxferp, dxfer_len);
+		ret = blk_rq_map_user(q, rq, dxferp, dxfer_len, GFP_KERNEL);
 		if (ret)
 			goto out;
 	}
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 3aab80a4c484..f49d6a11a69e 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -315,10 +315,11 @@ static int sg_io(struct file *file, struct request_queue *q,
 		}
 
 		ret = blk_rq_map_user_iov(q, rq, iov, hdr->iovec_count,
-					  hdr->dxfer_len);
+					  hdr->dxfer_len, GFP_KERNEL);
 		kfree(iov);
 	} else if (hdr->dxfer_len)
-		ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len);
+		ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len,
+				      GFP_KERNEL);
 
 	if (ret)
 		goto out;
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 74031de517e6..e861d24a6d32 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2097,7 +2097,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 
 		len = nr * CD_FRAMESIZE_RAW;
 
-		ret = blk_rq_map_user(q, rq, ubuf, len);
+		ret = blk_rq_map_user(q, rq, ubuf, len, GFP_KERNEL);
 		if (ret)
 			break;
 
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 257e097c39af..2a4fd820d616 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -362,7 +362,7 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
 	int err;
 
 	dprintk("%lx %u\n", uaddr, len);
-	err = blk_rq_map_user(q, rq, (void *)uaddr, len);
+	err = blk_rq_map_user(q, rq, (void *)uaddr, len, GFP_KERNEL);
 	if (err) {
 		/*
 		 * TODO: need to fixup sg_tablesize, max_segment_size,
diff --git a/fs/bio.c b/fs/bio.c
index 6a637b5c24b5..3d2e9ad24728 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -558,13 +558,14 @@ int bio_uncopy_user(struct bio *bio)
  *	@iov:	the iovec.
  *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Prepares and returns a bio for indirect user io, bouncing data
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
 struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
-			      int iov_count, int write_to_vm)
+			      int iov_count, int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
 	struct bio_vec *bvec;
@@ -587,12 +588,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 		len += iov[i].iov_len;
 	}
 
-	bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL);
+	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
 	if (!bmd)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		goto out_bmd;
 
@@ -605,7 +606,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 		if (bytes > len)
 			bytes = len;
 
-		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
+		page = alloc_page(q->bounce_gfp | gfp_mask);
 		if (!page) {
 			ret = -ENOMEM;
 			break;
@@ -647,26 +648,27 @@ out_bmd:
  *	@uaddr: start of user address
  *	@len: length in bytes
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Prepares and returns a bio for indirect user io, bouncing data
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
 struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
-			  unsigned int len, int write_to_vm)
+			  unsigned int len, int write_to_vm, gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+	return bio_copy_user_iov(q, &iov, 1, write_to_vm, gfp_mask);
 }
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
 				      struct sg_iovec *iov, int iov_count,
-				      int write_to_vm)
+				      int write_to_vm, gfp_t gfp_mask)
 {
 	int i, j;
 	int nr_pages = 0;
@@ -692,12 +694,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
 	if (!nr_pages)
 		return ERR_PTR(-EINVAL);
 
-	bio = bio_alloc(GFP_KERNEL, nr_pages);
+	bio = bio_alloc(gfp_mask, nr_pages);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
 	if (!pages)
 		goto out;
 
@@ -776,19 +778,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
  *	@uaddr: start of user address
  *	@len: length in bytes
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Map the user space address into a bio suitable for io to a block
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
-			 unsigned long uaddr, unsigned int len, int write_to_vm)
+			 unsigned long uaddr, unsigned int len, int write_to_vm,
+			 gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
 
 /**
@@ -798,18 +802,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
  *	@iov:	the iovec.
  *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
+ *	@gfp_mask: memory allocation flags
  *
  *	Map the user space address into a bio suitable for io to a block
  *	device. Returns an error pointer in case of error.
  */
 struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
 			     struct sg_iovec *iov, int iov_count,
-			     int write_to_vm)
+			     int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio *bio;
 
-	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
-
+	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
+				 gfp_mask);
 	if (IS_ERR(bio))
 		return bio;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 13aba20edb2d..200b185c3e83 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -325,11 +325,11 @@ extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
 extern int bio_get_nr_vecs(struct block_device *);
 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
-				unsigned long, unsigned int, int);
+				unsigned long, unsigned int, int, gfp_t);
 struct sg_iovec;
 extern struct bio *bio_map_user_iov(struct request_queue *,
 				    struct block_device *,
-				    struct sg_iovec *, int, int);
+				    struct sg_iovec *, int, int, gfp_t);
 extern void bio_unmap_user(struct bio *);
 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
 				gfp_t);
@@ -337,9 +337,10 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 				 gfp_t, int);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
-extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int);
+extern struct bio *bio_copy_user(struct request_queue *, unsigned long,
+				 unsigned int, int, gfp_t);
 extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *,
-				     int, int);
+				     int, int, gfp_t);
 extern int bio_uncopy_user(struct bio *);
 void zero_fill_bio(struct bio *bio);
 extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 12df8efeef19..00e388d0e221 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -710,11 +710,12 @@ extern void __blk_stop_queue(struct request_queue *q);
 extern void __blk_run_queue(struct request_queue *);
 extern void blk_run_queue(struct request_queue *);
 extern void blk_start_queueing(struct request_queue *);
-extern int blk_rq_map_user(struct request_queue *, struct request *, void __user *, unsigned long);
+extern int blk_rq_map_user(struct request_queue *, struct request *,
+			   void __user *, unsigned long, gfp_t);
 extern int blk_rq_unmap_user(struct bio *);
 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
-			       struct sg_iovec *, int, unsigned int);
+			       struct sg_iovec *, int, unsigned int, gfp_t);
 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,

From 152e283fdfea0cd11e297d982378b55937842dde Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 16:17:06 +0900
Subject: [PATCH 063/132] block: introduce struct rq_map_data to use reserved
 pages

This patch introduces struct rq_map_data to enable bio_copy_user_iov()
to use reserved pages.

Currently, bio_copy_user_iov allocates bounce pages but
drivers/scsi/sg.c wants to allocate pages by itself and use
them. struct rq_map_data can be used to pass allocated pages to
bio_copy_user_iov.

The current users of bio_copy_user_iov simply pass NULL (they don't
want to use pre-allocated pages).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Douglas Gilbert <dougg@torque.net>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c             | 26 ++++++++++-------
 block/bsg.c                 |  7 +++--
 block/scsi_ioctl.c          |  4 +--
 drivers/cdrom/cdrom.c       |  2 +-
 drivers/scsi/scsi_tgt_lib.c |  2 +-
 fs/bio.c                    | 58 ++++++++++++++++++++++++++-----------
 include/linux/bio.h         |  8 +++--
 include/linux/blkdev.h      | 12 ++++++--
 8 files changed, 80 insertions(+), 39 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index ac21b7397e15..dad6a2907835 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -41,8 +41,8 @@ static int __blk_rq_unmap_user(struct bio *bio)
 }
 
 static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
-			     void __user *ubuf, unsigned int len,
-			     gfp_t gfp_mask)
+			     struct rq_map_data *map_data, void __user *ubuf,
+			     unsigned int len, gfp_t gfp_mask)
 {
 	unsigned long uaddr;
 	unsigned int alignment;
@@ -57,10 +57,10 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 	 */
 	uaddr = (unsigned long) ubuf;
 	alignment = queue_dma_alignment(q) | q->dma_pad_mask;
-	if (!(uaddr & alignment) && !(len & alignment))
+	if (!(uaddr & alignment) && !(len & alignment) && !map_data)
 		bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
 	else
-		bio = bio_copy_user(q, uaddr, len, reading, gfp_mask);
+		bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
 
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
@@ -89,6 +89,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
  * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:		request queue where request should be inserted
  * @rq:		request structure to fill
+ * @map_data:   pointer to the rq_map_data holding pages (if necessary)
  * @ubuf:	the user buffer
  * @len:	length of user data
  * @gfp_mask:	memory allocation flags
@@ -107,7 +108,8 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
  *    unmapping.
  */
 int blk_rq_map_user(struct request_queue *q, struct request *rq,
-		    void __user *ubuf, unsigned long len, gfp_t gfp_mask)
+		    struct rq_map_data *map_data, void __user *ubuf,
+		    unsigned long len, gfp_t gfp_mask)
 {
 	unsigned long bytes_read = 0;
 	struct bio *bio = NULL;
@@ -134,7 +136,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 		if (end - start > BIO_MAX_PAGES)
 			map_len -= PAGE_SIZE;
 
-		ret = __blk_rq_map_user(q, rq, ubuf, map_len, gfp_mask);
+		ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len,
+					gfp_mask);
 		if (ret < 0)
 			goto unmap_rq;
 		if (!bio)
@@ -159,6 +162,7 @@ EXPORT_SYMBOL(blk_rq_map_user);
  * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:		request queue where request should be inserted
  * @rq:		request to map data to
+ * @map_data:   pointer to the rq_map_data holding pages (if necessary)
  * @iov:	pointer to the iovec
  * @iov_count:	number of elements in the iovec
  * @len:	I/O byte count
@@ -178,8 +182,8 @@ EXPORT_SYMBOL(blk_rq_map_user);
  *    unmapping.
  */
 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
-			struct sg_iovec *iov, int iov_count, unsigned int len,
-			gfp_t gfp_mask)
+			struct rq_map_data *map_data, struct sg_iovec *iov,
+			int iov_count, unsigned int len, gfp_t gfp_mask)
 {
 	struct bio *bio;
 	int i, read = rq_data_dir(rq) == READ;
@@ -197,8 +201,9 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		}
 	}
 
-	if (unaligned || (q->dma_pad_mask & len))
-		bio = bio_copy_user_iov(q, iov, iov_count, read, gfp_mask);
+	if (unaligned || (q->dma_pad_mask & len) || map_data)
+		bio = bio_copy_user_iov(q, map_data, iov, iov_count, read,
+					gfp_mask);
 	else
 		bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask);
 
@@ -220,6 +225,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	rq->buffer = rq->data = NULL;
 	return 0;
 }
+EXPORT_SYMBOL(blk_rq_map_user_iov);
 
 /**
  * blk_rq_unmap_user - unmap a request with user data
diff --git a/block/bsg.c b/block/bsg.c
index e7a142e9916c..56cb343c76d8 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -283,8 +283,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
 		next_rq->cmd_type = rq->cmd_type;
 
 		dxferp = (void*)(unsigned long)hdr->din_xferp;
-		ret =  blk_rq_map_user(q, next_rq, dxferp, hdr->din_xfer_len,
-				       GFP_KERNEL);
+		ret =  blk_rq_map_user(q, next_rq, NULL, dxferp,
+				       hdr->din_xfer_len, GFP_KERNEL);
 		if (ret)
 			goto out;
 	}
@@ -299,7 +299,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, int has_write_perm)
 		dxfer_len = 0;
 
 	if (dxfer_len) {
-		ret = blk_rq_map_user(q, rq, dxferp, dxfer_len, GFP_KERNEL);
+		ret = blk_rq_map_user(q, rq, NULL, dxferp, dxfer_len,
+				      GFP_KERNEL);
 		if (ret)
 			goto out;
 	}
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index f49d6a11a69e..c34272a348fe 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -314,11 +314,11 @@ static int sg_io(struct file *file, struct request_queue *q,
 			goto out;
 		}
 
-		ret = blk_rq_map_user_iov(q, rq, iov, hdr->iovec_count,
+		ret = blk_rq_map_user_iov(q, rq, NULL, iov, hdr->iovec_count,
 					  hdr->dxfer_len, GFP_KERNEL);
 		kfree(iov);
 	} else if (hdr->dxfer_len)
-		ret = blk_rq_map_user(q, rq, hdr->dxferp, hdr->dxfer_len,
+		ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
 				      GFP_KERNEL);
 
 	if (ret)
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index e861d24a6d32..d47f2f80accd 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2097,7 +2097,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 
 		len = nr * CD_FRAMESIZE_RAW;
 
-		ret = blk_rq_map_user(q, rq, ubuf, len, GFP_KERNEL);
+		ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL);
 		if (ret)
 			break;
 
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 2a4fd820d616..3117bb106b5d 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -362,7 +362,7 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
 	int err;
 
 	dprintk("%lx %u\n", uaddr, len);
-	err = blk_rq_map_user(q, rq, (void *)uaddr, len, GFP_KERNEL);
+	err = blk_rq_map_user(q, rq, NULL, (void *)uaddr, len, GFP_KERNEL);
 	if (err) {
 		/*
 		 * TODO: need to fixup sg_tablesize, max_segment_size,
diff --git a/fs/bio.c b/fs/bio.c
index 3d2e9ad24728..a2f072647cdf 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -439,16 +439,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 
 struct bio_map_data {
 	struct bio_vec *iovecs;
-	int nr_sgvecs;
 	struct sg_iovec *sgvecs;
+	int nr_sgvecs;
+	int is_our_pages;
 };
 
 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-			     struct sg_iovec *iov, int iov_count)
+			     struct sg_iovec *iov, int iov_count,
+			     int is_our_pages)
 {
 	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
 	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
 	bmd->nr_sgvecs = iov_count;
+	bmd->is_our_pages = is_our_pages;
 	bio->bi_private = bmd;
 }
 
@@ -483,7 +486,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
 }
 
 static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
-			  struct sg_iovec *iov, int iov_count, int uncopy)
+			  struct sg_iovec *iov, int iov_count, int uncopy,
+			  int do_free_page)
 {
 	int ret = 0, i;
 	struct bio_vec *bvec;
@@ -526,7 +530,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
 			}
 		}
 
-		if (uncopy)
+		if (do_free_page)
 			__free_page(bvec->bv_page);
 	}
 
@@ -545,7 +549,8 @@ int bio_uncopy_user(struct bio *bio)
 	struct bio_map_data *bmd = bio->bi_private;
 	int ret;
 
-	ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1);
+	ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1,
+			     bmd->is_our_pages);
 
 	bio_free_map_data(bmd);
 	bio_put(bio);
@@ -555,6 +560,7 @@ int bio_uncopy_user(struct bio *bio)
 /**
  *	bio_copy_user_iov	-	copy user data to bio
  *	@q: destination block queue
+ *	@map_data: pointer to the rq_map_data holding pages (if necessary)
  *	@iov:	the iovec.
  *	@iov_count: number of elements in the iovec
  *	@write_to_vm: bool indicating writing to pages or not
@@ -564,8 +570,10 @@ int bio_uncopy_user(struct bio *bio)
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
-			      int iov_count, int write_to_vm, gfp_t gfp_mask)
+struct bio *bio_copy_user_iov(struct request_queue *q,
+			      struct rq_map_data *map_data,
+			      struct sg_iovec *iov, int iov_count,
+			      int write_to_vm, gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
 	struct bio_vec *bvec;
@@ -600,13 +608,26 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 	bio->bi_rw |= (!write_to_vm << BIO_RW);
 
 	ret = 0;
+	i = 0;
 	while (len) {
-		unsigned int bytes = PAGE_SIZE;
+		unsigned int bytes;
+
+		if (map_data)
+			bytes = 1U << (PAGE_SHIFT + map_data->page_order);
+		else
+			bytes = PAGE_SIZE;
 
 		if (bytes > len)
 			bytes = len;
 
-		page = alloc_page(q->bounce_gfp | gfp_mask);
+		if (map_data) {
+			if (i == map_data->nr_entries) {
+				ret = -ENOMEM;
+				break;
+			}
+			page = map_data->pages[i++];
+		} else
+			page = alloc_page(q->bounce_gfp | gfp_mask);
 		if (!page) {
 			ret = -ENOMEM;
 			break;
@@ -625,16 +646,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
 	 * success
 	 */
 	if (!write_to_vm) {
-		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0);
+		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
 		if (ret)
 			goto cleanup;
 	}
 
-	bio_set_map_data(bmd, bio, iov, iov_count);
+	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
 	return bio;
 cleanup:
-	bio_for_each_segment(bvec, bio, i)
-		__free_page(bvec->bv_page);
+	if (!map_data)
+		bio_for_each_segment(bvec, bio, i)
+			__free_page(bvec->bv_page);
 
 	bio_put(bio);
 out_bmd:
@@ -645,6 +667,7 @@ out_bmd:
 /**
  *	bio_copy_user	-	copy user data to bio
  *	@q: destination block queue
+ *	@map_data: pointer to the rq_map_data holding pages (if necessary)
  *	@uaddr: start of user address
  *	@len: length in bytes
  *	@write_to_vm: bool indicating writing to pages or not
@@ -654,15 +677,16 @@ out_bmd:
  *	to/from kernel pages as necessary. Must be paired with
  *	call bio_uncopy_user() on io completion.
  */
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
-			  unsigned int len, int write_to_vm, gfp_t gfp_mask)
+struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
+			  unsigned long uaddr, unsigned int len,
+			  int write_to_vm, gfp_t gfp_mask)
 {
 	struct sg_iovec iov;
 
 	iov.iov_base = (void __user *)uaddr;
 	iov.iov_len = len;
 
-	return bio_copy_user_iov(q, &iov, 1, write_to_vm, gfp_mask);
+	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
@@ -1028,7 +1052,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 	bio->bi_private = bmd;
 	bio->bi_end_io = bio_copy_kern_endio;
 
-	bio_set_map_data(bmd, bio, &iov, 1);
+	bio_set_map_data(bmd, bio, &iov, 1, 1);
 	return bio;
 cleanup:
 	bio_for_each_segment(bvec, bio, i)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 200b185c3e83..bc386cd5e996 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -327,6 +327,7 @@ extern int bio_get_nr_vecs(struct block_device *);
 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
 				unsigned long, unsigned int, int, gfp_t);
 struct sg_iovec;
+struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
 				    struct block_device *,
 				    struct sg_iovec *, int, int, gfp_t);
@@ -337,9 +338,10 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 				 gfp_t, int);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
-extern struct bio *bio_copy_user(struct request_queue *, unsigned long,
-				 unsigned int, int, gfp_t);
-extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *,
+extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
+				 unsigned long, unsigned int, int, gfp_t);
+extern struct bio *bio_copy_user_iov(struct request_queue *,
+				     struct rq_map_data *, struct sg_iovec *,
 				     int, int, gfp_t);
 extern int bio_uncopy_user(struct bio *);
 void zero_fill_bio(struct bio *bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 00e388d0e221..358ac423ed2f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -642,6 +642,12 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 }
 #endif /* CONFIG_MMU */
 
+struct rq_map_data {
+	struct page **pages;
+	int page_order;
+	int nr_entries;
+};
+
 struct req_iterator {
 	int i;
 	struct bio *bio;
@@ -711,11 +717,13 @@ extern void __blk_run_queue(struct request_queue *);
 extern void blk_run_queue(struct request_queue *);
 extern void blk_start_queueing(struct request_queue *);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
-			   void __user *, unsigned long, gfp_t);
+			   struct rq_map_data *, void __user *, unsigned long,
+			   gfp_t);
 extern int blk_rq_unmap_user(struct bio *);
 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
-			       struct sg_iovec *, int, unsigned int, gfp_t);
+			       struct rq_map_data *, struct sg_iovec *, int,
+			       unsigned int, gfp_t);
 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,

From 10865dfa34e7552c4c64606edcdf1e21a110c985 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 16:17:07 +0900
Subject: [PATCH 064/132] sg: convert the non-data path to use the block layer

This patch converts the non-data path to use the block layer functions
(blk_get_request, blk_execute_rq_nowait, etc.) instead of using
scsi_execute_async().

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Douglas Gilbert <dougg@torque.net>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 53 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 661f9f21650a..487c7776cc4e 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -137,6 +137,7 @@ typedef struct sg_request {	/* SG_MAX_QUEUE requests outstanding per file */
 	char orphan;		/* 1 -> drop on sight, 0 -> normal */
 	char sg_io_owned;	/* 1 -> packet belongs to SG_IO */
 	volatile char done;	/* 0->before bh, 1->before read, 2->read */
+	struct request *rq;
 } Sg_request;
 
 typedef struct sg_fd {		/* holds the state of a file descriptor */
@@ -176,7 +177,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
 static int sg_fasync(int fd, struct file *filp, int mode);
 /* tasklet or soft irq callback */
 static void sg_cmd_done(void *data, char *sense, int result, int resid);
-static int sg_start_req(Sg_request * srp);
+static int sg_start_req(Sg_request *srp, unsigned char *cmd);
 static void sg_finish_rem_req(Sg_request * srp);
 static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size);
 static int sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp,
@@ -229,6 +230,11 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd)
 				  cmd, filp->f_mode & FMODE_WRITE);
 }
 
+static void sg_rq_end_io(struct request *rq, int uptodate)
+{
+	sg_cmd_done(rq->end_io_data, rq->sense, rq->errors, rq->data_len);
+}
+
 static int
 sg_open(struct inode *inode, struct file *filp)
 {
@@ -732,7 +738,8 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
 	SCSI_LOG_TIMEOUT(4, printk("sg_common_write:  scsi opcode=0x%02x, cmd_size=%d\n",
 			  (int) cmnd[0], (int) hp->cmd_len));
 
-	if ((k = sg_start_req(srp))) {
+	k = sg_start_req(srp, cmnd);
+	if (k) {
 		SCSI_LOG_TIMEOUT(1, printk("sg_common_write: start_req err=%d\n", k));
 		sg_finish_rem_req(srp);
 		return k;	/* probably out of space --> ENOMEM */
@@ -765,6 +772,12 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
 	hp->duration = jiffies_to_msecs(jiffies);
 /* Now send everything of to mid-level. The next time we hear about this
    packet is when sg_cmd_done() is called (i.e. a callback). */
+	if (srp->rq) {
+		srp->rq->timeout = timeout;
+		blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk,
+				      srp->rq, 1, sg_rq_end_io);
+		return 0;
+	}
 	if (scsi_execute_async(sdp->device, cmnd, hp->cmd_len, data_dir, srp->data.buffer,
 				hp->dxfer_len, srp->data.k_use_sg, timeout,
 				SG_DEFAULT_RETRIES, srp, sg_cmd_done,
@@ -1634,8 +1647,32 @@ exit_sg(void)
 	idr_destroy(&sg_index_idr);
 }
 
-static int
-sg_start_req(Sg_request * srp)
+static int __sg_start_req(struct sg_request *srp, struct sg_io_hdr *hp,
+			  unsigned char *cmd)
+{
+	struct sg_fd *sfp = srp->parentfp;
+	struct request_queue *q = sfp->parentdp->device->request_queue;
+	struct request *rq;
+	int rw = hp->dxfer_direction == SG_DXFER_TO_DEV ? WRITE : READ;
+
+	rq = blk_get_request(q, rw, GFP_ATOMIC);
+	if (!rq)
+		return -ENOMEM;
+
+	memcpy(rq->cmd, cmd, hp->cmd_len);
+
+	rq->cmd_len = hp->cmd_len;
+	rq->cmd_type = REQ_TYPE_BLOCK_PC;
+
+	srp->rq = rq;
+	rq->end_io_data = srp;
+	rq->sense = srp->sense_b;
+	rq->retries = SG_DEFAULT_RETRIES;
+
+	return 0;
+}
+
+static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 {
 	int res;
 	Sg_fd *sfp = srp->parentfp;
@@ -1646,8 +1683,10 @@ sg_start_req(Sg_request * srp)
 	Sg_scatter_hold *rsv_schp = &sfp->reserve;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_start_req: dxfer_len=%d\n", dxfer_len));
+
 	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
-		return 0;
+		return __sg_start_req(srp, hp, cmd);
+
 	if (sg_allow_dio && (hp->flags & SG_FLAG_DIRECT_IO) &&
 	    (dxfer_dir != SG_DXFER_UNKNOWN) && (0 == hp->iovec_count) &&
 	    (!sfp->parentdp->device->host->unchecked_isa_dma)) {
@@ -1678,6 +1717,10 @@ sg_finish_rem_req(Sg_request * srp)
 		sg_unlink_reserve(sfp, srp);
 	else
 		sg_remove_scat(req_schp);
+
+	if (srp->rq)
+		blk_put_request(srp->rq);
+
 	sg_remove_request(sfp, srp);
 }
 

From 6e5a30cba5e7c03b2cd564e968f1dd667a0f7c42 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 16:17:08 +0900
Subject: [PATCH 065/132] sg: convert the direct IO path to use the block layer

This patch converts the direct IO path (SG_FLAG_DIRECT_IO) to use the
block layer functions (blk_get_request, blk_execute_rq_nowait,
blk_rq_map_user, etc) instead of scsi_execute_async().

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Douglas Gilbert <dougg@torque.net>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 173 ++++++++--------------------------------------
 1 file changed, 27 insertions(+), 146 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 487c7776cc4e..cb6de0752ee1 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -138,6 +138,7 @@ typedef struct sg_request {	/* SG_MAX_QUEUE requests outstanding per file */
 	char sg_io_owned;	/* 1 -> packet belongs to SG_IO */
 	volatile char done;	/* 0->before bh, 1->before read, 2->read */
 	struct request *rq;
+	struct bio *bio;
 } Sg_request;
 
 typedef struct sg_fd {		/* holds the state of a file descriptor */
@@ -1679,21 +1680,29 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 	sg_io_hdr_t *hp = &srp->header;
 	int dxfer_len = (int) hp->dxfer_len;
 	int dxfer_dir = hp->dxfer_direction;
+	unsigned long uaddr = (unsigned long)hp->dxferp;
 	Sg_scatter_hold *req_schp = &srp->data;
 	Sg_scatter_hold *rsv_schp = &sfp->reserve;
+	struct request_queue *q = sfp->parentdp->device->request_queue;
+	unsigned long alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_start_req: dxfer_len=%d\n", dxfer_len));
 
 	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
 		return __sg_start_req(srp, hp, cmd);
 
+#ifdef SG_ALLOW_DIO_CODE
 	if (sg_allow_dio && (hp->flags & SG_FLAG_DIRECT_IO) &&
 	    (dxfer_dir != SG_DXFER_UNKNOWN) && (0 == hp->iovec_count) &&
-	    (!sfp->parentdp->device->host->unchecked_isa_dma)) {
-		res = sg_build_direct(srp, sfp, dxfer_len);
-		if (res <= 0)	/* -ve -> error, 0 -> done, 1 -> try indirect */
-			return res;
+	    (!sfp->parentdp->device->host->unchecked_isa_dma) &&
+	    !(uaddr & alignment) && !(dxfer_len & alignment)) {
+		res = __sg_start_req(srp, hp, cmd);
+		if (!res)
+			res = sg_build_direct(srp, sfp, dxfer_len);
+
+		return res;
 	}
+#endif
 	if ((!sg_res_in_use(sfp)) && (dxfer_len <= rsv_schp->bufflen))
 		sg_link_reserve(sfp, srp, dxfer_len);
 	else {
@@ -1718,8 +1727,11 @@ sg_finish_rem_req(Sg_request * srp)
 	else
 		sg_remove_scat(req_schp);
 
-	if (srp->rq)
+	if (srp->rq) {
+		if (srp->bio)
+			blk_rq_unmap_user(srp->bio);
 		blk_put_request(srp->rq);
+	}
 
 	sg_remove_request(sfp, srp);
 }
@@ -1746,151 +1758,23 @@ sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp, int tablesize)
 	return tablesize;	/* number of scat_gath elements allocated */
 }
 
-#ifdef SG_ALLOW_DIO_CODE
-/* vvvvvvvv  following code borrowed from st driver's direct IO vvvvvvvvv */
-	/* TODO: hopefully we can use the generic block layer code */
-
-/* Pin down user pages and put them into a scatter gather list. Returns <= 0 if
-   - mapping of all pages not successful
-   (i.e., either completely successful or fails)
-*/
-static int 
-st_map_user_pages(struct scatterlist *sgl, const unsigned int max_pages, 
-	          unsigned long uaddr, size_t count, int rw)
-{
-	unsigned long end = (uaddr + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	unsigned long start = uaddr >> PAGE_SHIFT;
-	const int nr_pages = end - start;
-	int res, i, j;
-	struct page **pages;
-
-	/* User attempted Overflow! */
-	if ((uaddr + count) < uaddr)
-		return -EINVAL;
-
-	/* Too big */
-        if (nr_pages > max_pages)
-		return -ENOMEM;
-
-	/* Hmm? */
-	if (count == 0)
-		return 0;
-
-	if ((pages = kmalloc(max_pages * sizeof(*pages), GFP_ATOMIC)) == NULL)
-		return -ENOMEM;
-
-        /* Try to fault in all of the necessary pages */
-	down_read(&current->mm->mmap_sem);
-        /* rw==READ means read from drive, write into memory area */
-	res = get_user_pages(
-		current,
-		current->mm,
-		uaddr,
-		nr_pages,
-		rw == READ,
-		0, /* don't force */
-		pages,
-		NULL);
-	up_read(&current->mm->mmap_sem);
-
-	/* Errors and no page mapped should return here */
-	if (res < nr_pages)
-		goto out_unmap;
-
-        for (i=0; i < nr_pages; i++) {
-                /* FIXME: flush superflous for rw==READ,
-                 * probably wrong function for rw==WRITE
-                 */
-		flush_dcache_page(pages[i]);
-		/* ?? Is locking needed? I don't think so */
-		/* if (!trylock_page(pages[i]))
-		   goto out_unlock; */
-        }
-
-	sg_set_page(sgl, pages[0], 0, uaddr & ~PAGE_MASK);
-	if (nr_pages > 1) {
-		sgl[0].length = PAGE_SIZE - sgl[0].offset;
-		count -= sgl[0].length;
-		for (i=1; i < nr_pages ; i++)
-			sg_set_page(&sgl[i], pages[i], count < PAGE_SIZE ? count : PAGE_SIZE, 0);
-	}
-	else {
-		sgl[0].length = count;
-	}
-
-	kfree(pages);
-	return nr_pages;
-
- out_unmap:
-	if (res > 0) {
-		for (j=0; j < res; j++)
-			page_cache_release(pages[j]);
-		res = 0;
-	}
-	kfree(pages);
-	return res;
-}
-
-
-/* And unmap them... */
-static int 
-st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
-		    int dirtied)
-{
-	int i;
-
-	for (i=0; i < nr_pages; i++) {
-		struct page *page = sg_page(&sgl[i]);
-
-		if (dirtied)
-			SetPageDirty(page);
-		/* unlock_page(page); */
-		/* FIXME: cache flush missing for rw==READ
-		 * FIXME: call the correct reference counting function
-		 */
-		page_cache_release(page);
-	}
-
-	return 0;
-}
-
-/* ^^^^^^^^  above code borrowed from st driver's direct IO ^^^^^^^^^ */
-#endif
-
-
 /* Returns: -ve -> error, 0 -> done, 1 -> try indirect */
 static int
 sg_build_direct(Sg_request * srp, Sg_fd * sfp, int dxfer_len)
 {
-#ifdef SG_ALLOW_DIO_CODE
 	sg_io_hdr_t *hp = &srp->header;
 	Sg_scatter_hold *schp = &srp->data;
-	int sg_tablesize = sfp->parentdp->sg_tablesize;
-	int mx_sc_elems, res;
-	struct scsi_device *sdev = sfp->parentdp->device;
+	int res;
+	struct request *rq = srp->rq;
+	struct request_queue *q = sfp->parentdp->device->request_queue;
 
-	if (((unsigned long)hp->dxferp &
-			queue_dma_alignment(sdev->request_queue)) != 0)
-		return 1;
-
-	mx_sc_elems = sg_build_sgat(schp, sfp, sg_tablesize);
-        if (mx_sc_elems <= 0) {
-                return 1;
-        }
-	res = st_map_user_pages(schp->buffer, mx_sc_elems,
-				(unsigned long)hp->dxferp, dxfer_len, 
-				(SG_DXFER_TO_DEV == hp->dxfer_direction) ? 1 : 0);
-	if (res <= 0) {
-		sg_remove_scat(schp);
-		return 1;
-	}
-	schp->k_use_sg = res;
+	res = blk_rq_map_user(q, rq, NULL, hp->dxferp, dxfer_len, GFP_ATOMIC);
+	if (res)
+		return res;
+	srp->bio = rq->bio;
 	schp->dio_in_use = 1;
 	hp->info |= SG_INFO_DIRECT_IO;
 	return 0;
-#else
-	return 1;
-#endif
 }
 
 static int
@@ -2069,11 +1953,7 @@ sg_remove_scat(Sg_scatter_hold * schp)
 	if (schp->buffer && (schp->sglist_len > 0)) {
 		struct scatterlist *sg = schp->buffer;
 
-		if (schp->dio_in_use) {
-#ifdef SG_ALLOW_DIO_CODE
-			st_unmap_user_pages(sg, schp->k_use_sg, TRUE);
-#endif
-		} else {
+		if (!schp->dio_in_use) {
 			int k;
 
 			for (k = 0; (k < schp->k_use_sg) && sg_page(sg);
@@ -2083,8 +1963,9 @@ sg_remove_scat(Sg_scatter_hold * schp)
 				    k, sg_page(sg), sg->length));
 				sg_page_free(sg_page(sg), sg->length);
 			}
+
+			kfree(schp->buffer);
 		}
-		kfree(schp->buffer);
 	}
 	memset(schp, 0, sizeof (*schp));
 }

From 10db10d144c0248f285242f79daf6b9de6b00a62 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 29 Aug 2008 12:32:18 +0200
Subject: [PATCH 066/132] sg: convert the indirect IO path to use the block
 layer

This patch converts the indirect IO path (including mmap IO and old
struct sg_header) to use the block layer functions (blk_get_request,
blk_execute_rq_nowait, blk_rq_map_user, etc) instead of
scsi_execute_async().

[Jens: fixed compile error with SCSI logging enabled]

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Douglas Gilbert <dougg@torque.net>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 397 ++++++++++++----------------------------------
 1 file changed, 105 insertions(+), 292 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index cb6de0752ee1..d6391666502c 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -47,7 +47,6 @@ static int sg_version_num = 30534;	/* 2 digits for each component */
 #include <linux/seq_file.h>
 #include <linux/blkdev.h>
 #include <linux/delay.h>
-#include <linux/scatterlist.h>
 #include <linux/blktrace_api.h>
 #include <linux/smp_lock.h>
 
@@ -119,7 +118,8 @@ typedef struct sg_scatter_hold { /* holding area for scsi scatter gather info */
 	unsigned sglist_len; /* size of malloc'd scatter-gather list ++ */
 	unsigned bufflen;	/* Size of (aggregate) data buffer */
 	unsigned b_malloc_len;	/* actual len malloc'ed in buffer */
-	struct scatterlist *buffer;/* scatter list */
+	struct page **pages;
+	int page_order;
 	char dio_in_use;	/* 0->indirect IO (or mmap), 1->dio */
 	unsigned char cmd_opcode; /* first byte of command */
 } Sg_scatter_hold;
@@ -190,8 +190,6 @@ static ssize_t sg_new_write(Sg_fd *sfp, struct file *file,
 			int read_only, Sg_request **o_srp);
 static int sg_common_write(Sg_fd * sfp, Sg_request * srp,
 			   unsigned char *cmnd, int timeout, int blocking);
-static int sg_u_iovec(sg_io_hdr_t * hp, int sg_num, int ind,
-		      int wr_xf, int *countp, unsigned char __user **up);
 static int sg_write_xfer(Sg_request * srp);
 static int sg_read_xfer(Sg_request * srp);
 static int sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer);
@@ -199,8 +197,6 @@ static void sg_remove_scat(Sg_scatter_hold * schp);
 static void sg_build_reserve(Sg_fd * sfp, int req_size);
 static void sg_link_reserve(Sg_fd * sfp, Sg_request * srp, int size);
 static void sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp);
-static struct page *sg_page_malloc(int rqSz, int lowDma, int *retSzp);
-static void sg_page_free(struct page *page, int size);
 static Sg_fd *sg_add_sfp(Sg_device * sdp, int dev);
 static int sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp);
 static void __sg_remove_sfp(Sg_device * sdp, Sg_fd * sfp);
@@ -771,26 +767,11 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
 		break;
 	}
 	hp->duration = jiffies_to_msecs(jiffies);
-/* Now send everything of to mid-level. The next time we hear about this
-   packet is when sg_cmd_done() is called (i.e. a callback). */
-	if (srp->rq) {
-		srp->rq->timeout = timeout;
-		blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk,
-				      srp->rq, 1, sg_rq_end_io);
-		return 0;
-	}
-	if (scsi_execute_async(sdp->device, cmnd, hp->cmd_len, data_dir, srp->data.buffer,
-				hp->dxfer_len, srp->data.k_use_sg, timeout,
-				SG_DEFAULT_RETRIES, srp, sg_cmd_done,
-				GFP_ATOMIC)) {
-		SCSI_LOG_TIMEOUT(1, printk("sg_common_write: scsi_execute_async failed\n"));
-		/*
-		 * most likely out of mem, but could also be a bad map
-		 */
-		sg_finish_rem_req(srp);
-		return -ENOMEM;
-	} else
-		return 0;
+
+	srp->rq->timeout = timeout;
+	blk_execute_rq_nowait(sdp->device->request_queue, sdp->disk,
+			      srp->rq, 1, sg_rq_end_io);
+	return 0;
 }
 
 static int
@@ -1206,8 +1187,7 @@ sg_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	Sg_fd *sfp;
 	unsigned long offset, len, sa;
 	Sg_scatter_hold *rsv_schp;
-	struct scatterlist *sg;
-	int k;
+	int k, length;
 
 	if ((NULL == vma) || (!(sfp = (Sg_fd *) vma->vm_private_data)))
 		return VM_FAULT_SIGBUS;
@@ -1217,15 +1197,14 @@ sg_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 	SCSI_LOG_TIMEOUT(3, printk("sg_vma_fault: offset=%lu, scatg=%d\n",
 				   offset, rsv_schp->k_use_sg));
-	sg = rsv_schp->buffer;
 	sa = vma->vm_start;
-	for (k = 0; (k < rsv_schp->k_use_sg) && (sa < vma->vm_end);
-	     ++k, sg = sg_next(sg)) {
+	length = 1 << (PAGE_SHIFT + rsv_schp->page_order);
+	for (k = 0; k < rsv_schp->k_use_sg && sa < vma->vm_end; k++) {
 		len = vma->vm_end - sa;
-		len = (len < sg->length) ? len : sg->length;
+		len = (len < length) ? len : length;
 		if (offset < len) {
-			struct page *page;
-			page = virt_to_page(page_address(sg_page(sg)) + offset);
+			struct page *page = nth_page(rsv_schp->pages[k],
+						     offset >> PAGE_SHIFT);
 			get_page(page);	/* increment page count */
 			vmf->page = page;
 			return 0; /* success */
@@ -1247,8 +1226,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 	Sg_fd *sfp;
 	unsigned long req_sz, len, sa;
 	Sg_scatter_hold *rsv_schp;
-	int k;
-	struct scatterlist *sg;
+	int k, length;
 
 	if ((!filp) || (!vma) || (!(sfp = (Sg_fd *) filp->private_data)))
 		return -ENXIO;
@@ -1262,11 +1240,10 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 		return -ENOMEM;	/* cannot map more than reserved buffer */
 
 	sa = vma->vm_start;
-	sg = rsv_schp->buffer;
-	for (k = 0; (k < rsv_schp->k_use_sg) && (sa < vma->vm_end);
-	     ++k, sg = sg_next(sg)) {
+	length = 1 << (PAGE_SHIFT + rsv_schp->page_order);
+	for (k = 0; k < rsv_schp->k_use_sg && sa < vma->vm_end; k++) {
 		len = vma->vm_end - sa;
-		len = (len < sg->length) ? len : sg->length;
+		len = (len < length) ? len : length;
 		sa += len;
 	}
 
@@ -1310,7 +1287,6 @@ sg_cmd_done(void *data, char *sense, int result, int resid)
 	if (0 != result) {
 		struct scsi_sense_hdr sshdr;
 
-		memcpy(srp->sense_b, sense, sizeof (srp->sense_b));
 		srp->header.status = 0xff & result;
 		srp->header.masked_status = status_byte(result);
 		srp->header.msg_status = msg_byte(result);
@@ -1685,34 +1661,51 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 	Sg_scatter_hold *rsv_schp = &sfp->reserve;
 	struct request_queue *q = sfp->parentdp->device->request_queue;
 	unsigned long alignment = queue_dma_alignment(q) | q->dma_pad_mask;
+	struct rq_map_data map_data;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_start_req: dxfer_len=%d\n", dxfer_len));
 
+	res = __sg_start_req(srp, hp, cmd);
+	if (res)
+		return res;
+
 	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
-		return __sg_start_req(srp, hp, cmd);
+		return 0;
 
 #ifdef SG_ALLOW_DIO_CODE
 	if (sg_allow_dio && (hp->flags & SG_FLAG_DIRECT_IO) &&
 	    (dxfer_dir != SG_DXFER_UNKNOWN) && (0 == hp->iovec_count) &&
 	    (!sfp->parentdp->device->host->unchecked_isa_dma) &&
-	    !(uaddr & alignment) && !(dxfer_len & alignment)) {
-		res = __sg_start_req(srp, hp, cmd);
-		if (!res)
-			res = sg_build_direct(srp, sfp, dxfer_len);
-
-		return res;
-	}
+	    !(uaddr & alignment) && !(dxfer_len & alignment))
+		return sg_build_direct(srp, sfp, dxfer_len);
 #endif
 	if ((!sg_res_in_use(sfp)) && (dxfer_len <= rsv_schp->bufflen))
 		sg_link_reserve(sfp, srp, dxfer_len);
-	else {
+	else
 		res = sg_build_indirect(req_schp, sfp, dxfer_len);
-		if (res) {
-			sg_remove_scat(req_schp);
-			return res;
-		}
+
+	if (!res) {
+		struct request *rq = srp->rq;
+		Sg_scatter_hold *schp = &srp->data;
+		int iovec_count = (int) hp->iovec_count;
+
+		map_data.pages = schp->pages;
+		map_data.page_order = schp->page_order;
+		map_data.nr_entries = schp->k_use_sg;
+
+		if (iovec_count)
+			res = blk_rq_map_user_iov(q, rq, &map_data, hp->dxferp,
+						  iovec_count,
+						  hp->dxfer_len, GFP_ATOMIC);
+		else
+			res = blk_rq_map_user(q, rq, &map_data, hp->dxferp,
+					      hp->dxfer_len, GFP_ATOMIC);
+
+		if (!res)
+			srp->bio = rq->bio;
 	}
-	return 0;
+
+	return res;
 }
 
 static void
@@ -1730,6 +1723,7 @@ sg_finish_rem_req(Sg_request * srp)
 	if (srp->rq) {
 		if (srp->bio)
 			blk_rq_unmap_user(srp->bio);
+
 		blk_put_request(srp->rq);
 	}
 
@@ -1739,21 +1733,12 @@ sg_finish_rem_req(Sg_request * srp)
 static int
 sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp, int tablesize)
 {
-	int sg_bufflen = tablesize * sizeof(struct scatterlist);
+	int sg_bufflen = tablesize * sizeof(struct page *);
 	gfp_t gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
 
-	/*
-	 * TODO: test without low_dma, we should not need it since
-	 * the block layer will bounce the buffer for us
-	 *
-	 * XXX(hch): we shouldn't need GFP_DMA for the actual S/G list.
-	 */
-	if (sfp->low_dma)
-		 gfp_flags |= GFP_DMA;
-	schp->buffer = kzalloc(sg_bufflen, gfp_flags);
-	if (!schp->buffer)
+	schp->pages = kzalloc(sg_bufflen, gfp_flags);
+	if (!schp->pages)
 		return -ENOMEM;
-	sg_init_table(schp->buffer, tablesize);
 	schp->sglist_len = sg_bufflen;
 	return tablesize;	/* number of scat_gath elements allocated */
 }
@@ -1780,11 +1765,10 @@ sg_build_direct(Sg_request * srp, Sg_fd * sfp, int dxfer_len)
 static int
 sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
 {
-	struct scatterlist *sg;
-	int ret_sz = 0, k, rem_sz, num, mx_sc_elems;
+	int ret_sz = 0, i, k, rem_sz, num, mx_sc_elems;
 	int sg_tablesize = sfp->parentdp->sg_tablesize;
-	int blk_size = buff_size;
-	struct page *p = NULL;
+	int blk_size = buff_size, order;
+	gfp_t gfp_mask = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN;
 
 	if (blk_size < 0)
 		return -EFAULT;
@@ -1808,15 +1792,26 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
 		} else
 			scatter_elem_sz_prev = num;
 	}
-	for (k = 0, sg = schp->buffer, rem_sz = blk_size;
-	     (rem_sz > 0) && (k < mx_sc_elems);
-	     ++k, rem_sz -= ret_sz, sg = sg_next(sg)) {
-		
+
+	if (sfp->low_dma)
+		gfp_mask |= GFP_DMA;
+
+	if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
+		gfp_mask |= __GFP_ZERO;
+
+	order = get_order(num);
+retry:
+	ret_sz = 1 << (PAGE_SHIFT + order);
+
+	for (k = 0, rem_sz = blk_size; rem_sz > 0 && k < mx_sc_elems;
+	     k++, rem_sz -= ret_sz) {
+
 		num = (rem_sz > scatter_elem_sz_prev) ?
-		      scatter_elem_sz_prev : rem_sz;
-		p = sg_page_malloc(num, sfp->low_dma, &ret_sz);
-		if (!p)
-			return -ENOMEM;
+			scatter_elem_sz_prev : rem_sz;
+
+		schp->pages[k] = alloc_pages(gfp_mask, order);
+		if (!schp->pages[k])
+			goto out;
 
 		if (num == scatter_elem_sz_prev) {
 			if (unlikely(ret_sz > scatter_elem_sz_prev)) {
@@ -1824,12 +1819,12 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
 				scatter_elem_sz_prev = ret_sz;
 			}
 		}
-		sg_set_page(sg, p, (ret_sz > num) ? num : ret_sz, 0);
 
 		SCSI_LOG_TIMEOUT(5, printk("sg_build_indirect: k=%d, num=%d, "
 				 "ret_sz=%d\n", k, num, ret_sz));
 	}		/* end of for loop */
 
+	schp->page_order = order;
 	schp->k_use_sg = k;
 	SCSI_LOG_TIMEOUT(5, printk("sg_build_indirect: k_use_sg=%d, "
 			 "rem_sz=%d\n", k, rem_sz));
@@ -1837,8 +1832,15 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
 	schp->bufflen = blk_size;
 	if (rem_sz > 0)	/* must have failed */
 		return -ENOMEM;
-
 	return 0;
+out:
+	for (i = 0; i < k; i++)
+		__free_pages(schp->pages[k], order);
+
+	if (--order >= 0)
+		goto retry;
+
+	return -ENOMEM;
 }
 
 static int
@@ -1846,13 +1848,8 @@ sg_write_xfer(Sg_request * srp)
 {
 	sg_io_hdr_t *hp = &srp->header;
 	Sg_scatter_hold *schp = &srp->data;
-	struct scatterlist *sg = schp->buffer;
 	int num_xfer = 0;
-	int j, k, onum, usglen, ksglen, res;
-	int iovec_count = (int) hp->iovec_count;
 	int dxfer_dir = hp->dxfer_direction;
-	unsigned char *p;
-	unsigned char __user *up;
 	int new_interface = ('\0' == hp->interface_id) ? 0 : 1;
 
 	if ((SG_DXFER_UNKNOWN == dxfer_dir) || (SG_DXFER_TO_DEV == dxfer_dir) ||
@@ -1866,83 +1863,9 @@ sg_write_xfer(Sg_request * srp)
 	     && ((SG_FLAG_NO_DXFER | SG_FLAG_MMAP_IO) & hp->flags)))
 		return 0;
 
-	SCSI_LOG_TIMEOUT(4, printk("sg_write_xfer: num_xfer=%d, iovec_count=%d, k_use_sg=%d\n",
-			  num_xfer, iovec_count, schp->k_use_sg));
-	if (iovec_count) {
-		onum = iovec_count;
-		if (!access_ok(VERIFY_READ, hp->dxferp, SZ_SG_IOVEC * onum))
-			return -EFAULT;
-	} else
-		onum = 1;
+	SCSI_LOG_TIMEOUT(4, printk("sg_write_xfer: num_xfer=%d, k_use_sg=%d\n",
+			  num_xfer, schp->k_use_sg));
 
-	ksglen = sg->length;
-	p = page_address(sg_page(sg));
-	for (j = 0, k = 0; j < onum; ++j) {
-		res = sg_u_iovec(hp, iovec_count, j, 1, &usglen, &up);
-		if (res)
-			return res;
-
-		for (; p; sg = sg_next(sg), ksglen = sg->length,
-		     p = page_address(sg_page(sg))) {
-			if (usglen <= 0)
-				break;
-			if (ksglen > usglen) {
-				if (usglen >= num_xfer) {
-					if (__copy_from_user(p, up, num_xfer))
-						return -EFAULT;
-					return 0;
-				}
-				if (__copy_from_user(p, up, usglen))
-					return -EFAULT;
-				p += usglen;
-				ksglen -= usglen;
-				break;
-			} else {
-				if (ksglen >= num_xfer) {
-					if (__copy_from_user(p, up, num_xfer))
-						return -EFAULT;
-					return 0;
-				}
-				if (__copy_from_user(p, up, ksglen))
-					return -EFAULT;
-				up += ksglen;
-				usglen -= ksglen;
-			}
-			++k;
-			if (k >= schp->k_use_sg)
-				return 0;
-		}
-	}
-
-	return 0;
-}
-
-static int
-sg_u_iovec(sg_io_hdr_t * hp, int sg_num, int ind,
-	   int wr_xf, int *countp, unsigned char __user **up)
-{
-	int num_xfer = (int) hp->dxfer_len;
-	unsigned char __user *p = hp->dxferp;
-	int count;
-
-	if (0 == sg_num) {
-		if (wr_xf && ('\0' == hp->interface_id))
-			count = (int) hp->flags;	/* holds "old" input_size */
-		else
-			count = num_xfer;
-	} else {
-		sg_iovec_t iovec;
-		if (__copy_from_user(&iovec, p + ind*SZ_SG_IOVEC, SZ_SG_IOVEC))
-			return -EFAULT;
-		p = iovec.iov_base;
-		count = (int) iovec.iov_len;
-	}
-	if (!access_ok(wr_xf ? VERIFY_READ : VERIFY_WRITE, p, count))
-		return -EFAULT;
-	if (up)
-		*up = p;
-	if (countp)
-		*countp = count;
 	return 0;
 }
 
@@ -1950,21 +1873,18 @@ static void
 sg_remove_scat(Sg_scatter_hold * schp)
 {
 	SCSI_LOG_TIMEOUT(4, printk("sg_remove_scat: k_use_sg=%d\n", schp->k_use_sg));
-	if (schp->buffer && (schp->sglist_len > 0)) {
-		struct scatterlist *sg = schp->buffer;
-
+	if (schp->pages && schp->sglist_len > 0) {
 		if (!schp->dio_in_use) {
 			int k;
 
-			for (k = 0; (k < schp->k_use_sg) && sg_page(sg);
-			     ++k, sg = sg_next(sg)) {
+			for (k = 0; k < schp->k_use_sg && schp->pages[k]; k++) {
 				SCSI_LOG_TIMEOUT(5, printk(
-				    "sg_remove_scat: k=%d, pg=0x%p, len=%d\n",
-				    k, sg_page(sg), sg->length));
-				sg_page_free(sg_page(sg), sg->length);
+				    "sg_remove_scat: k=%d, pg=0x%p\n",
+				    k, schp->pages[k]));
+				__free_pages(schp->pages[k], schp->page_order);
 			}
 
-			kfree(schp->buffer);
+			kfree(schp->pages);
 		}
 	}
 	memset(schp, 0, sizeof (*schp));
@@ -1975,13 +1895,8 @@ sg_read_xfer(Sg_request * srp)
 {
 	sg_io_hdr_t *hp = &srp->header;
 	Sg_scatter_hold *schp = &srp->data;
-	struct scatterlist *sg = schp->buffer;
 	int num_xfer = 0;
-	int j, k, onum, usglen, ksglen, res;
-	int iovec_count = (int) hp->iovec_count;
 	int dxfer_dir = hp->dxfer_direction;
-	unsigned char *p;
-	unsigned char __user *up;
 	int new_interface = ('\0' == hp->interface_id) ? 0 : 1;
 
 	if ((SG_DXFER_UNKNOWN == dxfer_dir) || (SG_DXFER_FROM_DEV == dxfer_dir)
@@ -1996,53 +1911,7 @@ sg_read_xfer(Sg_request * srp)
 		return 0;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_read_xfer: num_xfer=%d, iovec_count=%d, k_use_sg=%d\n",
-			  num_xfer, iovec_count, schp->k_use_sg));
-	if (iovec_count) {
-		onum = iovec_count;
-		if (!access_ok(VERIFY_READ, hp->dxferp, SZ_SG_IOVEC * onum))
-			return -EFAULT;
-	} else
-		onum = 1;
-
-	p = page_address(sg_page(sg));
-	ksglen = sg->length;
-	for (j = 0, k = 0; j < onum; ++j) {
-		res = sg_u_iovec(hp, iovec_count, j, 0, &usglen, &up);
-		if (res)
-			return res;
-
-		for (; p; sg = sg_next(sg), ksglen = sg->length,
-		     p = page_address(sg_page(sg))) {
-			if (usglen <= 0)
-				break;
-			if (ksglen > usglen) {
-				if (usglen >= num_xfer) {
-					if (__copy_to_user(up, p, num_xfer))
-						return -EFAULT;
-					return 0;
-				}
-				if (__copy_to_user(up, p, usglen))
-					return -EFAULT;
-				p += usglen;
-				ksglen -= usglen;
-				break;
-			} else {
-				if (ksglen >= num_xfer) {
-					if (__copy_to_user(up, p, num_xfer))
-						return -EFAULT;
-					return 0;
-				}
-				if (__copy_to_user(up, p, ksglen))
-					return -EFAULT;
-				up += ksglen;
-				usglen -= ksglen;
-			}
-			++k;
-			if (k >= schp->k_use_sg)
-				return 0;
-		}
-	}
-
+			  num_xfer, (int)hp->iovec_count, schp->k_use_sg));
 	return 0;
 }
 
@@ -2050,7 +1919,6 @@ static int
 sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer)
 {
 	Sg_scatter_hold *schp = &srp->data;
-	struct scatterlist *sg = schp->buffer;
 	int k, num;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_read_oxfer: num_read_xfer=%d\n",
@@ -2058,15 +1926,18 @@ sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer)
 	if ((!outp) || (num_read_xfer <= 0))
 		return 0;
 
-	for (k = 0; (k < schp->k_use_sg) && sg_page(sg); ++k, sg = sg_next(sg)) {
-		num = sg->length;
+	blk_rq_unmap_user(srp->bio);
+	srp->bio = NULL;
+
+	num = 1 << (PAGE_SHIFT + schp->page_order);
+	for (k = 0; k < schp->k_use_sg && schp->pages[k]; k++) {
 		if (num > num_read_xfer) {
-			if (__copy_to_user(outp, page_address(sg_page(sg)),
+			if (__copy_to_user(outp, page_address(schp->pages[k]),
 					   num_read_xfer))
 				return -EFAULT;
 			break;
 		} else {
-			if (__copy_to_user(outp, page_address(sg_page(sg)),
+			if (__copy_to_user(outp, page_address(schp->pages[k]),
 					   num))
 				return -EFAULT;
 			num_read_xfer -= num;
@@ -2101,24 +1972,22 @@ sg_link_reserve(Sg_fd * sfp, Sg_request * srp, int size)
 {
 	Sg_scatter_hold *req_schp = &srp->data;
 	Sg_scatter_hold *rsv_schp = &sfp->reserve;
-	struct scatterlist *sg = rsv_schp->buffer;
 	int k, num, rem;
 
 	srp->res_used = 1;
 	SCSI_LOG_TIMEOUT(4, printk("sg_link_reserve: size=%d\n", size));
 	rem = size;
 
-	for (k = 0; k < rsv_schp->k_use_sg; ++k, sg = sg_next(sg)) {
-		num = sg->length;
+	num = 1 << (PAGE_SHIFT + rsv_schp->page_order);
+	for (k = 0; k < rsv_schp->k_use_sg; k++) {
 		if (rem <= num) {
-			sfp->save_scat_len = num;
-			sg->length = rem;
 			req_schp->k_use_sg = k + 1;
 			req_schp->sglist_len = rsv_schp->sglist_len;
-			req_schp->buffer = rsv_schp->buffer;
+			req_schp->pages = rsv_schp->pages;
 
 			req_schp->bufflen = size;
 			req_schp->b_malloc_len = rsv_schp->b_malloc_len;
+			req_schp->page_order = rsv_schp->page_order;
 			break;
 		} else
 			rem -= num;
@@ -2132,22 +2001,13 @@ static void
 sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp)
 {
 	Sg_scatter_hold *req_schp = &srp->data;
-	Sg_scatter_hold *rsv_schp = &sfp->reserve;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_unlink_reserve: req->k_use_sg=%d\n",
 				   (int) req_schp->k_use_sg));
-	if ((rsv_schp->k_use_sg > 0) && (req_schp->k_use_sg > 0)) {
-		struct scatterlist *sg = rsv_schp->buffer;
-
-		if (sfp->save_scat_len > 0)
-			(sg + (req_schp->k_use_sg - 1))->length =
-			    (unsigned) sfp->save_scat_len;
-		else
-			SCSI_LOG_TIMEOUT(1, printk ("sg_unlink_reserve: BAD save_scat_len\n"));
-	}
 	req_schp->k_use_sg = 0;
 	req_schp->bufflen = 0;
-	req_schp->buffer = NULL;
+	req_schp->pages = NULL;
+	req_schp->page_order = 0;
 	req_schp->sglist_len = 0;
 	sfp->save_scat_len = 0;
 	srp->res_used = 0;
@@ -2405,53 +2265,6 @@ sg_res_in_use(Sg_fd * sfp)
 	return srp ? 1 : 0;
 }
 
-/* The size fetched (value output via retSzp) set when non-NULL return */
-static struct page *
-sg_page_malloc(int rqSz, int lowDma, int *retSzp)
-{
-	struct page *resp = NULL;
-	gfp_t page_mask;
-	int order, a_size;
-	int resSz;
-
-	if ((rqSz <= 0) || (NULL == retSzp))
-		return resp;
-
-	if (lowDma)
-		page_mask = GFP_ATOMIC | GFP_DMA | __GFP_COMP | __GFP_NOWARN;
-	else
-		page_mask = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN;
-
-	for (order = 0, a_size = PAGE_SIZE; a_size < rqSz;
-	     order++, a_size <<= 1) ;
-	resSz = a_size;		/* rounded up if necessary */
-	resp = alloc_pages(page_mask, order);
-	while ((!resp) && order) {
-		--order;
-		a_size >>= 1;	/* divide by 2, until PAGE_SIZE */
-		resp =  alloc_pages(page_mask, order);	/* try half */
-		resSz = a_size;
-	}
-	if (resp) {
-		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
-			memset(page_address(resp), 0, resSz);
-		*retSzp = resSz;
-	}
-	return resp;
-}
-
-static void
-sg_page_free(struct page *page, int size)
-{
-	int order, a_size;
-
-	if (!page)
-		return;
-	for (order = 0, a_size = PAGE_SIZE; a_size < size;
-	     order++, a_size <<= 1) ;
-	__free_pages(page, order);
-}
-
 #ifdef CONFIG_SCSI_PROC_FS
 static int
 sg_idr_max_id(int id, void *p, void *data)

From 4d8ab62e087d9300883b82c2662e73e6eef803a3 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 15:05:57 +0900
Subject: [PATCH 067/132] bio: convert bio_copy_kern to use bio_copy_user

bio_copy_kern and bio_copy_user are very similar. This converts
bio_copy_kern to use bio_copy_user.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 54 ++++--------------------------------------------------
 1 file changed, 4 insertions(+), 50 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index a2f072647cdf..9d68ddb89b71 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -995,48 +995,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
 struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 			  gfp_t gfp_mask, int reading)
 {
-	unsigned long kaddr = (unsigned long)data;
-	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	unsigned long start = kaddr >> PAGE_SHIFT;
-	const int nr_pages = end - start;
 	struct bio *bio;
 	struct bio_vec *bvec;
-	struct bio_map_data *bmd;
-	int i, ret;
-	struct sg_iovec iov;
+	int i;
 
-	iov.iov_base = data;
-	iov.iov_len = len;
-
-	bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask);
-	if (!bmd)
-		return ERR_PTR(-ENOMEM);
-
-	ret = -ENOMEM;
-	bio = bio_alloc(gfp_mask, nr_pages);
-	if (!bio)
-		goto out_bmd;
-
-	while (len) {
-		struct page *page;
-		unsigned int bytes = PAGE_SIZE;
-
-		if (bytes > len)
-			bytes = len;
-
-		page = alloc_page(q->bounce_gfp | gfp_mask);
-		if (!page) {
-			ret = -ENOMEM;
-			goto cleanup;
-		}
-
-		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
-			ret = -EINVAL;
-			goto cleanup;
-		}
-
-		len -= bytes;
-	}
+	bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
+	if (IS_ERR(bio))
+		return bio;
 
 	if (!reading) {
 		void *p = data;
@@ -1049,20 +1014,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 		}
 	}
 
-	bio->bi_private = bmd;
 	bio->bi_end_io = bio_copy_kern_endio;
 
-	bio_set_map_data(bmd, bio, &iov, 1, 1);
 	return bio;
-cleanup:
-	bio_for_each_segment(bvec, bio, i)
-		__free_page(bvec->bv_page);
-
-	bio_put(bio);
-out_bmd:
-	bio_free_map_data(bmd);
-
-	return ERR_PTR(ret);
 }
 
 /*

From 879040742cf09f2360a9ac41846288707e4e567c Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 15:05:58 +0900
Subject: [PATCH 068/132] block: add blk_rq_aligned helper function

This adds blk_rq_aligned helper function to see if alignment and
padding requirement is satisfied for DMA transfer. This also converts
blk_rq_map_kern and __blk_rq_map_user to use the helper function.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c        | 12 ++----------
 include/linux/blkdev.h |  7 +++++++
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index dad6a2907835..572140cda5ff 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -45,7 +45,6 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 			     unsigned int len, gfp_t gfp_mask)
 {
 	unsigned long uaddr;
-	unsigned int alignment;
 	struct bio *bio, *orig_bio;
 	int reading, ret;
 
@@ -56,8 +55,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 	 * direct dma. else, set up kernel bounce buffers
 	 */
 	uaddr = (unsigned long) ubuf;
-	alignment = queue_dma_alignment(q) | q->dma_pad_mask;
-	if (!(uaddr & alignment) && !(len & alignment) && !map_data)
+	if (blk_rq_aligned(q, ubuf, len) && !map_data)
 		bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
 	else
 		bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@ -274,8 +272,6 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 		    unsigned int len, gfp_t gfp_mask)
 {
-	unsigned long kaddr;
-	unsigned int alignment;
 	int reading = rq_data_dir(rq) == READ;
 	int do_copy = 0;
 	struct bio *bio;
@@ -285,11 +281,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (!len || !kbuf)
 		return -EINVAL;
 
-	kaddr = (unsigned long)kbuf;
-	alignment = queue_dma_alignment(q) | q->dma_pad_mask;
-	do_copy = ((kaddr & alignment) || (len & alignment) ||
-		   object_is_on_stack(kbuf));
-
+	do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf);
 	if (do_copy)
 		bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
 	else
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 358ac423ed2f..9c2549260427 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -899,6 +899,13 @@ static inline int queue_dma_alignment(struct request_queue *q)
 	return q ? q->dma_alignment : 511;
 }
 
+static inline int blk_rq_aligned(struct request_queue *q, void *addr,
+				 unsigned int len)
+{
+	unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
+	return !((unsigned long)addr & alignment) && !(len & alignment);
+}
+
 /* assumes size > 256 */
 static inline unsigned int blksize_bits(unsigned int size)
 {

From 01cfcddd98f09e05a2e36031654ed46643b76f23 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 28 Aug 2008 15:05:59 +0900
Subject: [PATCH 069/132] sg: use blk_rq_aligned helper function

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Douglas Gilbert <dougg@torque.net>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index d6391666502c..ed69292babde 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1656,11 +1656,9 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 	sg_io_hdr_t *hp = &srp->header;
 	int dxfer_len = (int) hp->dxfer_len;
 	int dxfer_dir = hp->dxfer_direction;
-	unsigned long uaddr = (unsigned long)hp->dxferp;
 	Sg_scatter_hold *req_schp = &srp->data;
 	Sg_scatter_hold *rsv_schp = &sfp->reserve;
 	struct request_queue *q = sfp->parentdp->device->request_queue;
-	unsigned long alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 	struct rq_map_data map_data;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_start_req: dxfer_len=%d\n", dxfer_len));
@@ -1676,7 +1674,7 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 	if (sg_allow_dio && (hp->flags & SG_FLAG_DIRECT_IO) &&
 	    (dxfer_dir != SG_DXFER_UNKNOWN) && (0 == hp->iovec_count) &&
 	    (!sfp->parentdp->device->host->unchecked_isa_dma) &&
-	    !(uaddr & alignment) && !(dxfer_len & alignment))
+	    blk_rq_aligned(q, hp->dxferp, dxfer_len))
 		return sg_build_direct(srp, sfp, dxfer_len);
 #endif
 	if ((!sg_res_in_use(sfp)) && (dxfer_len <= rsv_schp->bufflen))

From aeb3d3a81e81c6323a17fe914e91eb228b3f1aa1 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Thu, 28 Aug 2008 09:27:42 +0200
Subject: [PATCH 070/132] block: kmalloc args reversed, small function
 definition fixes

Noticed by sparse:
block/blk-softirq.c:156:12: warning: symbol 'blk_softirq_init' was not declared. Should it be static?
block/genhd.c:583:28: warning: function 'bdget_disk' with external linkage has definition
block/genhd.c:659:17: warning: incorrect type in argument 1 (different base types)
block/genhd.c:659:17:    expected unsigned int [unsigned] [usertype] size
block/genhd.c:659:17:    got restricted gfp_t
block/genhd.c:659:29: warning: incorrect type in argument 2 (different base types)
block/genhd.c:659:29:    expected restricted gfp_t [usertype] flags
block/genhd.c:659:29:    got unsigned int
block: kmalloc args reversed

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c | 2 +-
 block/genhd.c        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index a60e959a12c4..d70692badcdb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -443,7 +443,7 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
-int __init blk_settings_init(void)
+static int __init blk_settings_init(void)
 {
 	blk_max_low_pfn = max_low_pfn - 1;
 	blk_max_pfn = max_pfn - 1;
diff --git a/block/genhd.c b/block/genhd.c
index d9de3e482d1e..32ee73c67560 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -578,7 +578,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
  * RETURNS:
  * Resulting block_device on success, NULL on failure.
  */
-extern struct block_device *bdget_disk(struct gendisk *disk, int partno)
+struct block_device *bdget_disk(struct gendisk *disk, int partno)
 {
 	struct hd_struct *part;
 	struct block_device *bdev = NULL;
@@ -654,7 +654,7 @@ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos)
 	struct class_dev_iter *iter;
 	struct device *dev;
 
-	iter = kmalloc(GFP_KERNEL, sizeof(*iter));
+	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
 		return ERR_PTR(-ENOMEM);
 

From 759f8ca3048f7438aa3129268d7252552505d662 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 29 Aug 2008 09:06:29 +0200
Subject: [PATCH 071/132] Change default value of CONFIG_DEBUG_BLOCK_EXT_DEVT
 to 'n'

It's a debug option that you would explicitly enable to test this
feature, so we should default it to 'n' to prevent accidental surprises
for now.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5a536f703a83..4378d5e923ca 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -628,7 +628,7 @@ config DEBUG_BLOCK_EXT_DEVT
         bool "Force extended block device numbers and spread them"
 	depends on DEBUG_KERNEL
 	depends on BLOCK
-	default y
+	default n
 	help
 	  Conventionally, block device numbers are allocated from
 	  predetermined contiguous area.  However, extended block area

From 2bbedcb4c1abac498f18e5770d62ae66ff235ada Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Aug 2008 11:41:51 +0200
Subject: [PATCH 072/132] block: don't test for partition size in bdget_disk()
 and blk_lookup_devt()

bdget_disk() and blk_lookup_devt() never cared whether the specified
partition (or disk) is zero sized or not.  I got confused while
converting those not to depend on consecutive minor numbers in commit
5a6411b1178baf534aa9138052864dfa89d3eada and later when dev0 was added
it broke callers which expected to get valid return for zero sized
disk devices.

So, they never needed nr_sects checks in the first place.  Kill them.

This problem was spotted and debugged by Bartlomiej Zolnierkiewicz.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 32ee73c67560..ed926b760ca0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -584,7 +584,7 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno)
 	struct block_device *bdev = NULL;
 
 	part = disk_get_part(disk, partno);
-	if (part && (part->nr_sects || partno == 0))
+	if (part)
 		bdev = bdget(part_devt(part));
 	disk_put_part(part);
 
@@ -1031,7 +1031,7 @@ dev_t blk_lookup_devt(const char *name, int partno)
 			continue;
 
 		part = disk_get_part(disk, partno);
-		if (part && (part->nr_sects || partno == 0)) {
+		if (part) {
 			devt = part_devt(part);
 			disk_put_part(part);
 			break;

From 55dc7db70a73a3809a2334063c9b5b0d8ccebdaa Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 1 Sep 2008 13:44:35 +0200
Subject: [PATCH 073/132] init: DEBUG_BLOCK_EXT_DEVT requires explicit root=
 param

DEBUG_BLOCK_EXT_DEVT shuffles SCSI and IDE device numbers, so a root
device number set using rdev becomes meaningless.  Root devices should
be explicitly specified using textual names.  Warn about it if root
can't be found and DEBUG_BLOCK_EXT_DEVT is enabled.  Also, add a warning
to the help text.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 init/do_mounts.c  | 4 ++++
 lib/Kconfig.debug | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/init/do_mounts.c b/init/do_mounts.c
index 3715feb8446d..d055b1914c3d 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -263,6 +263,10 @@ retry:
 		printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
 
 		printk_all_partitions();
+#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
+		printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify "
+		       "explicit textual name for \"root=\" boot option.\n");
+#endif
 		panic("VFS: Unable to mount root fs on %s", b);
 	}
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4378d5e923ca..c556896abe57 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -638,6 +638,12 @@ config DEBUG_BLOCK_EXT_DEVT
 	  userland code paths which assume predetermined contiguous
 	  device number allocation.
 
+	  Note that turning on this debug option shuffles all the
+	  device numbers for all IDE and SCSI devices including libata
+	  ones, so root partition specified using device number
+	  directly (via rdev or root=MAJ:MIN) won't work anymore.
+	  Textual device names (root=/dev/sdXn) will continue to work.
+
 	  Say N if you are unsure.
 
 config LKDTM

From 839e96afba87117befd39cf4e43f156edc8047a7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 2 Sep 2008 09:25:21 +0200
Subject: [PATCH 074/132] block: update comment on end_request()

It refers to functions that no longer exist after the IO completion
changes.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5bf806adc770..f25eb9786d94 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1780,9 +1780,9 @@ EXPORT_SYMBOL(end_dequeued_request);
  *     they have a residual value to account for. For that case this function
  *     isn't really useful, unless the residual just happens to be the
  *     full current segment. In other words, don't use this function in new
- *     code. Either use end_request_completely(), or the
- *     end_that_request_chunk() (along with end_that_request_last()) for
- *     partial completions.
+ *     code. Use blk_end_request() or __blk_end_request() to end partial parts
+ *     of a request, or end_dequeued_request() and end_queued_request() to
+ *     completely end IO on a dequeued/queued request.
  *
  **/
 void end_request(struct request *req, int uptodate)

From 818827669d85b84241696ffef2de485db46b0b5e Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 16:20:19 +0900
Subject: [PATCH 075/132] block: make blk_rq_map_user take a NULL user-space
 buffer

This patch changes blk_rq_map_user to accept a NULL user-space buffer
with a READ command if rq_map_data is not NULL. Thus a caller can pass
page frames to blk_rq_map_user to just set up a request and bios with
page frames properly. bio_uncopy_user (called via blk_rq_unmap_user)
doesn't copy data to user space for such a request.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c     | 16 ++++++++++++----
 fs/bio.c            |  8 ++++----
 include/linux/bio.h |  1 +
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index 572140cda5ff..4849fa36161e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -42,7 +42,7 @@ static int __blk_rq_unmap_user(struct bio *bio)
 
 static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 			     struct rq_map_data *map_data, void __user *ubuf,
-			     unsigned int len, gfp_t gfp_mask)
+			     unsigned int len, int null_mapped, gfp_t gfp_mask)
 {
 	unsigned long uaddr;
 	struct bio *bio, *orig_bio;
@@ -63,6 +63,9 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
+	if (null_mapped)
+		bio->bi_flags |= (1 << BIO_NULL_MAPPED);
+
 	orig_bio = bio;
 	blk_queue_bounce(q, &bio);
 
@@ -111,12 +114,17 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 {
 	unsigned long bytes_read = 0;
 	struct bio *bio = NULL;
-	int ret;
+	int ret, null_mapped = 0;
 
 	if (len > (q->max_hw_sectors << 9))
 		return -EINVAL;
-	if (!len || !ubuf)
+	if (!len)
 		return -EINVAL;
+	if (!ubuf) {
+		if (!map_data || rq_data_dir(rq) != READ)
+			return -EINVAL;
+		null_mapped = 1;
+	}
 
 	while (bytes_read != len) {
 		unsigned long map_len, end, start;
@@ -135,7 +143,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 			map_len -= PAGE_SIZE;
 
 		ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len,
-					gfp_mask);
+					null_mapped, gfp_mask);
 		if (ret < 0)
 			goto unmap_rq;
 		if (!bio)
diff --git a/fs/bio.c b/fs/bio.c
index 9d68ddb89b71..355302985e22 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -547,11 +547,11 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
 int bio_uncopy_user(struct bio *bio)
 {
 	struct bio_map_data *bmd = bio->bi_private;
-	int ret;
-
-	ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1,
-			     bmd->is_our_pages);
+	int ret = 0;
 
+	if (!bio_flagged(bio, BIO_NULL_MAPPED))
+		ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
+				     bmd->nr_sgvecs, 1, bmd->is_our_pages);
 	bio_free_map_data(bmd);
 	bio_put(bio);
 	return ret;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index bc386cd5e996..7af373f253dc 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -108,6 +108,7 @@ struct bio {
 #define BIO_USER_MAPPED 6	/* contains user pages */
 #define BIO_EOPNOTSUPP	7	/* not supported */
 #define BIO_CPU_AFFINE	8	/* complete bio on same CPU as submitted */
+#define BIO_NULL_MAPPED 9	/* contains invalid user pages */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*

From fad7f01e61bf737fe8a3740d803f000db57ecac6 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 16:20:20 +0900
Subject: [PATCH 076/132] sg: set dxferp to NULL for READ with the older SG
 interface

With the older SG interface, we don't know the user-space address to
transfer data to when executing a SCSI command. So we can't pass a
user-space address to blk_rq_map_user.

This patch fixes sg to pass a NULL user-space address to
blk_rq_map_user so that it just sets up a request and bios with page
frames properly without data transfer.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index ed69292babde..50c07bca727d 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -615,7 +615,10 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos)
 	else
 		hp->dxfer_direction = (mxsize > 0) ? SG_DXFER_FROM_DEV : SG_DXFER_NONE;
 	hp->dxfer_len = mxsize;
-	hp->dxferp = (char __user *)buf + cmd_size;
+	if (hp->dxfer_direction == SG_DXFER_TO_DEV)
+		hp->dxferp = (char __user *)buf + cmd_size;
+	else
+		hp->dxferp = NULL;
 	hp->sbp = NULL;
 	hp->timeout = old_hdr.reply_len;	/* structure abuse ... */
 	hp->flags = input_size;	/* structure abuse ... */

From 243294dae09c909c0442c8f04d470b69c3c19d6e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 4 Sep 2008 09:17:31 +0200
Subject: [PATCH 077/132] block: fix duplicate headers for /proc/partitions

seqf can be started multiple times for a read and the header should be
printed only for the initial one.  Fix it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index ed926b760ca0..8acaff0154e3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -697,7 +697,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
 	static void *p;
 
 	p = disk_seqf_start(seqf, pos);
-	if (!IS_ERR(p) && p)
+	if (!IS_ERR(p) && p && !*pos)
 		seq_puts(seqf, "major minor  #blocks  name\n\n");
 	return p;
 }

From 0c002c2f74e10baa9021d3ecc50585c6eafea568 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:20 -0600
Subject: [PATCH 078/132] Wrapper for lower-level revalidate_disk routines.

This is a wrapper for the lower-level revalidate_disk call-backs such
as sd_revalidate_disk(). It allows us to perform pre and post
operations when calling them.

We will use this wrapper in a later patch to adjust block device sizes
after an online resize (a _post_ operation).

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c     | 21 +++++++++++++++++++++
 include/linux/fs.h |  1 +
 2 files changed, 22 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index c3fa19bd64df..4eeb69a88734 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -852,6 +852,27 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
 
 EXPORT_SYMBOL(open_by_devnum);
 
+/**
+ * revalidate_disk - wrapper for lower-level driver's revalidate_disk
+ *                   call-back
+ *
+ * @disk: struct gendisk to be revalidated
+ *
+ * This routine is a wrapper for lower-level driver's revalidate_disk
+ * call-backs.  It is used to do common pre and post operations needed
+ * for all revalidate_disk operations.
+ */
+int revalidate_disk(struct gendisk *disk)
+{
+	int ret = 0;
+
+	if (disk->fops->revalidate_disk)
+		ret = disk->fops->revalidate_disk(disk);
+
+	return ret;
+}
+EXPORT_SYMBOL(revalidate_disk);
+
 /*
  * This routine checks whether a removable media has been changed,
  * and invalidates all buffer-cache-entries in that case. This
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 02a9fb5a830c..d63461f97983 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1722,6 +1722,7 @@ extern int fs_may_remount_ro(struct super_block *);
  */
 #define bio_data_dir(bio)	((bio)->bi_rw & 1)
 
+extern int revalidate_disk(struct gendisk *);
 extern int check_disk_change(struct block_device *);
 extern int __invalidate_device(struct block_device *);
 extern int invalidate_partition(struct gendisk *, int);

From c3279d1454cdfed02a557d789d8a6d08ab4cbe70 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:25 -0600
Subject: [PATCH 079/132] Adjust block device size after an online resize of a
 disk.

The revalidate_disk routine now checks if a disk has been resized by
comparing the gendisk capacity to the bdev inode size.  If they are
different (usually because the disk has been resized underneath the kernel)
the bdev inode size is adjusted to match the capacity.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c     | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 39 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4eeb69a88734..b721955d382e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -852,6 +852,34 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
 
 EXPORT_SYMBOL(open_by_devnum);
 
+/**
+ * check_disk_size_change - checks for disk size change and adjusts
+ *                          bdev size.
+ *
+ * @disk: struct gendisk to check
+ * @bdev: struct bdev to adjust.
+ *
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs.
+ */
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+{
+	loff_t disk_size, bdev_size;
+
+	disk_size = (loff_t)get_capacity(disk) << 9;
+	bdev_size = i_size_read(bdev->bd_inode);
+	if (disk_size != bdev_size) {
+		char name[BDEVNAME_SIZE];
+
+		disk_name(disk, 0, name);
+		printk(KERN_INFO
+		       "%s: detected capacity change from %lld to %lld\n",
+		       name, bdev_size, disk_size);
+		i_size_write(bdev->bd_inode, disk_size);
+	}
+}
+EXPORT_SYMBOL(check_disk_size_change);
+
 /**
  * revalidate_disk - wrapper for lower-level driver's revalidate_disk
  *                   call-back
@@ -864,11 +892,20 @@ EXPORT_SYMBOL(open_by_devnum);
  */
 int revalidate_disk(struct gendisk *disk)
 {
+	struct block_device *bdev;
 	int ret = 0;
 
 	if (disk->fops->revalidate_disk)
 		ret = disk->fops->revalidate_disk(disk);
 
+	bdev = bdget_disk(disk, 0);
+	if (!bdev)
+		return ret;
+
+	mutex_lock(&bdev->bd_mutex);
+	check_disk_size_change(disk, bdev);
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
 	return ret;
 }
 EXPORT_SYMBOL(revalidate_disk);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d63461f97983..32477e8872d5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1722,6 +1722,8 @@ extern int fs_may_remount_ro(struct super_block *);
  */
 #define bio_data_dir(bio)	((bio)->bi_rw & 1)
 
+extern void check_disk_size_change(struct gendisk *disk,
+				   struct block_device *bdev);
 extern int revalidate_disk(struct gendisk *);
 extern int check_disk_change(struct block_device *);
 extern int __invalidate_device(struct block_device *);

From 9bc3ffbfbdf71fefda8a261ef8d6fdc388a29b42 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:30 -0600
Subject: [PATCH 080/132] Check for device resize when rescanning partitions

Check for device resize in the rescan_partitions() routine. If the device
has been resized, the bdev size is set to match. The rescan_partitions()
routine is called when opening the device and when calling the
BLKRRPART ioctl.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0e411603fdf5..7408227c49c9 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -504,7 +504,6 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 	res = invalidate_partition(disk, 0);
 	if (res)
 		return res;
-	bdev->bd_invalidated = 0;
 
 	disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
 	while ((part = disk_part_iter_next(&piter)))
@@ -513,6 +512,8 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
+	check_disk_size_change(disk, bdev);
+	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
 		return 0;
 	if (IS_ERR(state))	/* I/O error reading the partition table */

From f98a8cae12f2b2a8f9bfd7a53c990a1a405e880e Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:35 -0600
Subject: [PATCH 081/132] SCSI sd driver calls revalidate_disk wrapper.

Modify the SCSI disk driver to call the revalidate_disk()
wrapper. This allows us to do some housekeeping such as accounting for
a disk being resized online. The wrapper will call
sd_revalidate_disk() at the appropriate time.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index bcb04b2a7676..cb115d1bf228 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -165,7 +165,7 @@ sd_store_cache_type(struct device *dev, struct device_attribute *attr,
 			sd_print_sense_hdr(sdkp, &sshdr);
 		return -EINVAL;
 	}
-	sd_revalidate_disk(sdkp->disk);
+	revalidate_disk(sdkp->disk);
 	return count;
 }
 
@@ -916,7 +916,7 @@ static void sd_rescan(struct device *dev)
 	struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev);
 
 	if (sdkp) {
-		sd_revalidate_disk(sdkp->disk);
+		revalidate_disk(sdkp->disk);
 		scsi_disk_put(sdkp);
 	}
 }

From 56ade44b46780fa291fa68b824f1dafdcb11b0ca Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:40 -0600
Subject: [PATCH 082/132] Added flush_disk to factor out common buffer cache
 flushing code.

We need to be able to flush the buffer cache for more than
just when a disk is changed, so we factor out common cache flush code
in check_disk_change() to an internal flush_disk() routine.  This
routine will then be used for both disk changes and disk resizes (in a
later patch).

Include the disk name in the text indicating that there are busy
inodes on the device and increase the KERN severity of the message.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b721955d382e..33650fc537c4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -852,6 +852,32 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
 
 EXPORT_SYMBOL(open_by_devnum);
 
+/**
+ * flush_disk - invalidates all buffer-cache entries on a disk
+ *
+ * @bdev:      struct block device to be flushed
+ *
+ * Invalidates all buffer-cache entries on a disk. It should be called
+ * when a disk has been changed -- either by a media change or online
+ * resize.
+ */
+static void flush_disk(struct block_device *bdev)
+{
+	if (__invalidate_device(bdev)) {
+		char name[BDEVNAME_SIZE] = "";
+
+		if (bdev->bd_disk)
+			disk_name(bdev->bd_disk, 0, name);
+		printk(KERN_WARNING "VFS: busy inodes on changed media or "
+		       "resized disk %s\n", name);
+	}
+
+	if (!bdev->bd_disk)
+		return;
+	if (disk_partitionable(bdev->bd_disk))
+		bdev->bd_invalidated = 1;
+}
+
 /**
  * check_disk_size_change - checks for disk size change and adjusts
  *                          bdev size.
@@ -929,13 +955,9 @@ int check_disk_change(struct block_device *bdev)
 	if (!bdops->media_changed(bdev->bd_disk))
 		return 0;
 
-	if (__invalidate_device(bdev))
-		printk("VFS: busy inodes on changed media.\n");
-
+	flush_disk(bdev);
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
-	if (disk_partitionable(bdev->bd_disk))
-		bdev->bd_invalidated = 1;
 	return 1;
 }
 

From 608aeef17a91747d6303de4df5e2c2e6899a95e8 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Thu, 4 Sep 2008 14:27:45 -0600
Subject: [PATCH 083/132] Call flush_disk() after detecting an online resize.

We call flush_disk() to make sure the buffer cache for the disk is
flushed after a disk resize. There are two resize cases, growing and
shrinking. Given that users can shrink and then grow a disk before
revalidate_disk() is called, we treat the grow case identically to
shrinking. We need to flush the buffer cache after an online shrink
because, as James Bottomley puts it,

     The two use cases for shrinking I can see are

     1. planned: the fs is already shrunk to within the new boundaries
        and all data is relocated, so invalidate is fine (any dirty
        buffers that might exist in the shrunk region are there only
        because they were relocated but not yet written to their
        original location).
     2. unplanned:  In this case, the fs is probably toast, so whether
        we invalidate or not isn't going to make a whole lot of
        difference; it's still going to try to read or write from
        sectors beyond the new size and get I/O errors.

Immediately invalidating shrunk disks will cause errors for outstanding
I/Os for reads/writes beyond the new end of the disk to be generated
earlier than if we waited for the normal buffer cache operation. It also
removes a potential security hole where we might keep old data around
from beyond the end of the shrunk disk if the disk was not invalidated.

Signed-off-by: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 33650fc537c4..57e2786dd2a5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -902,6 +902,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
 		       "%s: detected capacity change from %lld to %lld\n",
 		       name, bdev_size, disk_size);
 		i_size_write(bdev->bd_inode, disk_size);
+		flush_disk(bdev);
 	}
 }
 EXPORT_SYMBOL(check_disk_size_change);

From 242f9dcb8ba6f68fcd217a119a7648a4f69290e9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sun, 14 Sep 2008 05:55:09 -0700
Subject: [PATCH 084/132] block: unify request timeout handling

Right now SCSI and others do their own command timeout handling.
Move those bits to the block layer.

Instead of having a timer per command, we try to be a bit more clever
and simply have one per-queue. This avoids the overhead of having to
tear down and setup a timer for each command, so it will result in a lot
less timer fiddling.

Signed-off-by: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Makefile                       |   4 +-
 block/blk-core.c                     |   7 ++
 block/blk-settings.c                 |  12 +++
 block/blk-softirq.c                  |  30 +++---
 block/blk-timeout.c                  | 155 +++++++++++++++++++++++++++
 block/blk.h                          |  24 +++++
 block/elevator.c                     |   8 ++
 drivers/ata/libata-eh.c              |  13 +--
 drivers/ata/libata.h                 |   2 +-
 drivers/scsi/aacraid/aachba.c        |   2 +-
 drivers/scsi/gdth.c                  |  60 +++++++----
 drivers/scsi/gdth.h                  |   2 +-
 drivers/scsi/gdth_proc.c             |  66 ------------
 drivers/scsi/gdth_proc.h             |   3 -
 drivers/scsi/ibmvscsi/ibmvscsi.c     |   2 +-
 drivers/scsi/ide-scsi.c              |   2 +-
 drivers/scsi/ipr.c                   |   3 +-
 drivers/scsi/ips.c                   |   2 +-
 drivers/scsi/libiscsi.c              |  17 +--
 drivers/scsi/libsas/sas_ata.c        |   2 +-
 drivers/scsi/libsas/sas_internal.h   |   2 +-
 drivers/scsi/libsas/sas_scsi_host.c  |  30 +++---
 drivers/scsi/megaraid/megaraid_sas.c |   6 +-
 drivers/scsi/ncr53c8xx.c             |   4 +-
 drivers/scsi/qla1280.c               |   4 +-
 drivers/scsi/qla4xxx/ql4_os.c        |   4 +-
 drivers/scsi/scsi.c                  |  92 +++-------------
 drivers/scsi/scsi_error.c            |  90 +++-------------
 drivers/scsi/scsi_lib.c              |  17 ++-
 drivers/scsi/scsi_priv.h             |   7 +-
 drivers/scsi/scsi_sysfs.c            |   7 +-
 drivers/scsi/scsi_transport_fc.c     |   6 +-
 drivers/scsi/sd.c                    |   9 +-
 drivers/scsi/sr.c                    |   5 +-
 drivers/scsi/sym53c8xx_2/sym_glue.c  |   4 +-
 include/linux/blkdev.h               |  20 ++++
 include/scsi/scsi_cmnd.h             |   3 -
 include/scsi/scsi_host.h             |   9 +-
 include/scsi/scsi_transport.h        |   3 +-
 39 files changed, 399 insertions(+), 339 deletions(-)
 create mode 100644 block/blk-timeout.c

diff --git a/block/Makefile b/block/Makefile
index 0da976ce67dd..bfe73049f939 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -4,8 +4,8 @@
 
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
-			blk-exec.o blk-merge.o blk-softirq.o ioctl.o genhd.o \
-			scsi_ioctl.o cmd-filter.o
+			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
+			ioctl.o genhd.o scsi_ioctl.o cmd-filter.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/blk-core.c b/block/blk-core.c
index f25eb9786d94..d768a8ddc173 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -110,6 +110,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	memset(rq, 0, sizeof(*rq));
 
 	INIT_LIST_HEAD(&rq->queuelist);
+	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->cpu = -1;
 	rq->q = q;
 	rq->sector = rq->hard_sector = (sector_t) -1;
@@ -490,6 +491,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	}
 
 	init_timer(&q->unplug_timer);
+	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+	INIT_LIST_HEAD(&q->timeout_list);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
 
@@ -897,6 +900,8 @@ EXPORT_SYMBOL(blk_start_queueing);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
+	blk_delete_timer(rq);
+	blk_clear_rq_complete(rq);
 	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
 
 	if (blk_rq_tagged(rq))
@@ -1650,6 +1655,8 @@ static void end_that_request_last(struct request *req, int error)
 {
 	struct gendisk *disk = req->rq_disk;
 
+	blk_delete_timer(req);
+
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index d70692badcdb..1d0330d0b40a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -77,6 +77,18 @@ void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
 }
 EXPORT_SYMBOL(blk_queue_softirq_done);
 
+void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
+{
+	q->rq_timeout = timeout;
+}
+EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
+
+void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
+{
+	q->rq_timed_out_fn = fn;
+}
+EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
+
 /**
  * blk_queue_make_request - define an alternate make_request function for a device
  * @q:  the request queue for the device to be affected
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 3a1af551191e..7ab344afb16f 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -101,18 +101,7 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 	.notifier_call	= blk_cpu_notify,
 };
 
-/**
- * blk_complete_request - end I/O on a request
- * @req:      the request being processed
- *
- * Description:
- *     Ends all I/O on a request. It does not handle partial completions,
- *     unless the driver actually implements this in its completion callback
- *     through requeueing. The actual completion happens out-of-order,
- *     through a softirq handler. The user must have registered a completion
- *     callback through blk_queue_softirq_done().
- **/
-void blk_complete_request(struct request *req)
+void __blk_complete_request(struct request *req)
 {
 	struct request_queue *q = req->q;
 	unsigned long flags;
@@ -151,6 +140,23 @@ do_local:
 
 	local_irq_restore(flags);
 }
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req:      the request being processed
+ *
+ * Description:
+ *     Ends all I/O on a request. It does not handle partial completions,
+ *     unless the driver actually implements this in its completion callback
+ *     through requeueing. The actual completion happens out-of-order,
+ *     through a softirq handler. The user must have registered a completion
+ *     callback through blk_queue_softirq_done().
+ **/
+void blk_complete_request(struct request *req)
+{
+	if (!blk_mark_rq_complete(req))
+		__blk_complete_request(req);
+}
 EXPORT_SYMBOL(blk_complete_request);
 
 __init int blk_softirq_init(void)
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
new file mode 100644
index 000000000000..b36d07bf0afb
--- /dev/null
+++ b/block/blk-timeout.c
@@ -0,0 +1,155 @@
+/*
+ * Functions related to generic timeout handling of requests.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+
+#include "blk.h"
+
+/*
+ * blk_delete_timer - Delete/cancel timer for a given function.
+ * @req:	request that we are canceling timer for
+ *
+ */
+void blk_delete_timer(struct request *req)
+{
+	struct request_queue *q = req->q;
+
+	/*
+	 * Nothing to detach
+	 */
+	if (!q->rq_timed_out_fn || !req->deadline)
+		return;
+
+	list_del_init(&req->timeout_list);
+
+	if (list_empty(&q->timeout_list))
+		del_timer(&q->timeout);
+}
+
+static void blk_rq_timed_out(struct request *req)
+{
+	struct request_queue *q = req->q;
+	enum blk_eh_timer_return ret;
+
+	ret = q->rq_timed_out_fn(req);
+	switch (ret) {
+	case BLK_EH_HANDLED:
+		__blk_complete_request(req);
+		break;
+	case BLK_EH_RESET_TIMER:
+		blk_clear_rq_complete(req);
+		blk_add_timer(req);
+		break;
+	case BLK_EH_NOT_HANDLED:
+		/*
+		 * LLD handles this for now but in the future
+		 * we can send a request msg to abort the command
+		 * and we can move more of the generic scsi eh code to
+		 * the blk layer.
+		 */
+		break;
+	default:
+		printk(KERN_ERR "block: bad eh return: %d\n", ret);
+		break;
+	}
+}
+
+void blk_rq_timed_out_timer(unsigned long data)
+{
+	struct request_queue *q = (struct request_queue *) data;
+	unsigned long flags, uninitialized_var(next), next_set = 0;
+	struct request *rq, *tmp;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
+		if (time_after_eq(jiffies, rq->deadline)) {
+			list_del_init(&rq->timeout_list);
+
+			/*
+			 * Check if we raced with end io completion
+			 */
+			if (blk_mark_rq_complete(rq))
+				continue;
+			blk_rq_timed_out(rq);
+		}
+		if (!next_set) {
+			next = rq->deadline;
+			next_set = 1;
+		} else if (time_after(next, rq->deadline))
+			next = rq->deadline;
+	}
+
+	if (next_set && !list_empty(&q->timeout_list))
+		mod_timer(&q->timeout, round_jiffies(next));
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+/**
+ * blk_abort_request -- Request request recovery for the specified command
+ * @req:	pointer to the request of interest
+ *
+ * This function requests that the block layer start recovery for the
+ * request by deleting the timer and calling the q's timeout function.
+ * LLDDs who implement their own error recovery MAY ignore the timeout
+ * event if they generated blk_abort_req. Must hold queue lock.
+ */
+void blk_abort_request(struct request *req)
+{
+	blk_delete_timer(req);
+	blk_rq_timed_out(req);
+}
+EXPORT_SYMBOL_GPL(blk_abort_request);
+
+/**
+ * blk_add_timer - Start timeout timer for a single request
+ * @req:	request that is about to start running.
+ *
+ * Notes:
+ *    Each request has its own timer, and as it is added to the queue, we
+ *    set up the timer. When the request completes, we cancel the timer.
+ */
+void blk_add_timer(struct request *req)
+{
+	struct request_queue *q = req->q;
+	unsigned long expiry;
+
+	if (!q->rq_timed_out_fn)
+		return;
+
+	BUG_ON(!list_empty(&req->timeout_list));
+	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+
+	if (req->timeout)
+		req->deadline = jiffies + req->timeout;
+	else {
+		req->deadline = jiffies + q->rq_timeout;
+		/*
+		 * Some LLDs, like scsi, peek at the timeout to prevent
+		 * a command from being retried forever.
+		 */
+		req->timeout = q->rq_timeout;
+	}
+	list_add_tail(&req->timeout_list, &q->timeout_list);
+
+	/*
+	 * If the timer isn't already pending or this timeout is earlier
+	 * than an existing one, modify the timer. Round to next nearest
+	 * second.
+	 */
+	expiry = round_jiffies(req->deadline);
+
+	/*
+	 * We use ->deadline == 0 to detect whether a timer was added or
+	 * not, so just increase to next jiffy for that specific case
+	 */
+	if (unlikely(!req->deadline))
+		req->deadline = 1;
+
+	if (!timer_pending(&q->timeout) ||
+	    time_before(expiry, q->timeout.expires))
+		mod_timer(&q->timeout, expiry);
+}
diff --git a/block/blk.h b/block/blk.h
index de74254cb916..a4f4a50aefaa 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -17,6 +17,30 @@ void __blk_queue_free_tags(struct request_queue *q);
 
 void blk_unplug_work(struct work_struct *work);
 void blk_unplug_timeout(unsigned long data);
+void blk_rq_timed_out_timer(unsigned long data);
+void blk_delete_timer(struct request *);
+void blk_add_timer(struct request *);
+
+/*
+ * Internal atomic flags for request handling
+ */
+enum rq_atomic_flags {
+	REQ_ATOM_COMPLETE = 0,
+};
+
+/*
+ * EH timer and IO completion will both attempt to 'grab' the request, make
+ * sure that only one of them suceeds
+ */
+static inline int blk_mark_rq_complete(struct request *rq)
+{
+	return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+}
+
+static inline void blk_clear_rq_complete(struct request *rq)
+{
+	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+}
 
 struct io_context *current_io_context(gfp_t gfp_flags, int node);
 
diff --git a/block/elevator.c b/block/elevator.c
index 8e3fc3afc77b..a91fc59edd01 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -36,6 +36,8 @@
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
+#include "blk.h"
+
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
@@ -771,6 +773,12 @@ struct request *elv_next_request(struct request_queue *q)
 			 */
 			rq->cmd_flags |= REQ_STARTED;
 			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+
+			/*
+			 * We are now handing the request to the hardware,
+			 * add the timeout handler
+			 */
+			blk_add_timer(rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index c1db2f234d2e..bd0b2bc76f10 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -33,6 +33,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/blkdev.h>
 #include <linux/pci.h>
 #include <scsi/scsi.h>
 #include <scsi/scsi_host.h>
@@ -457,29 +458,29 @@ static void ata_eh_clear_action(struct ata_link *link, struct ata_device *dev,
  *	RETURNS:
  *	EH_HANDLED or EH_NOT_HANDLED
  */
-enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
+enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 {
 	struct Scsi_Host *host = cmd->device->host;
 	struct ata_port *ap = ata_shost_to_port(host);
 	unsigned long flags;
 	struct ata_queued_cmd *qc;
-	enum scsi_eh_timer_return ret;
+	enum blk_eh_timer_return ret;
 
 	DPRINTK("ENTER\n");
 
 	if (ap->ops->error_handler) {
-		ret = EH_NOT_HANDLED;
+		ret = BLK_EH_NOT_HANDLED;
 		goto out;
 	}
 
-	ret = EH_HANDLED;
+	ret = BLK_EH_HANDLED;
 	spin_lock_irqsave(ap->lock, flags);
 	qc = ata_qc_from_tag(ap, ap->link.active_tag);
 	if (qc) {
 		WARN_ON(qc->scsicmd != cmd);
 		qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
 		qc->err_mask |= AC_ERR_TIMEOUT;
-		ret = EH_NOT_HANDLED;
+		ret = BLK_EH_NOT_HANDLED;
 	}
 	spin_unlock_irqrestore(ap->lock, flags);
 
@@ -831,7 +832,7 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
 	 * Note that ATA_QCFLAG_FAILED is unconditionally set after
 	 * this function completes.
 	 */
-	scsi_req_abort_cmd(qc->scsicmd);
+	blk_abort_request(qc->scsicmd->request);
 }
 
 /**
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index ade5c75b6144..24f5005478b0 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -152,7 +152,7 @@ extern int ata_bus_probe(struct ata_port *ap);
 /* libata-eh.c */
 extern unsigned long ata_internal_cmd_timeout(struct ata_device *dev, u8 cmd);
 extern void ata_internal_cmd_timed_out(struct ata_device *dev, u8 cmd);
-extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
+extern enum blk_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd);
 extern void ata_scsi_error(struct Scsi_Host *host);
 extern void ata_port_wait_eh(struct ata_port *ap);
 extern void ata_eh_fastdrain_timerfn(unsigned long arg);
diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index aa4e77c25273..8abfd06b5a72 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -1139,7 +1139,7 @@ static struct aac_srb * aac_scsi_common(struct fib * fib, struct scsi_cmnd * cmd
 	srbcmd->id       = cpu_to_le32(scmd_id(cmd));
 	srbcmd->lun      = cpu_to_le32(cmd->device->lun);
 	srbcmd->flags    = cpu_to_le32(flag);
-	timeout = cmd->timeout_per_command/HZ;
+	timeout = cmd->request->timeout/HZ;
 	if (timeout == 0)
 		timeout = 1;
 	srbcmd->timeout  = cpu_to_le32(timeout);  // timeout in seconds
diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c
index 822d5214692b..c387c15a2128 100644
--- a/drivers/scsi/gdth.c
+++ b/drivers/scsi/gdth.c
@@ -464,7 +464,6 @@ int __gdth_execute(struct scsi_device *sdev, gdth_cmd_str *gdtcmd, char *cmnd,
 
     /* use request field to save the ptr. to completion struct. */
     scp->request = (struct request *)&wait;
-    scp->timeout_per_command = timeout*HZ;
     scp->cmd_len = 12;
     scp->cmnd = cmnd;
     cmndinfo.priority = IOCTL_PRI;
@@ -1995,23 +1994,12 @@ static void gdth_putq(gdth_ha_str *ha, Scsi_Cmnd *scp, unchar priority)
     register Scsi_Cmnd *pscp;
     register Scsi_Cmnd *nscp;
     ulong flags;
-    unchar b, t;
 
     TRACE(("gdth_putq() priority %d\n",priority));
     spin_lock_irqsave(&ha->smp_lock, flags);
 
-    if (!cmndinfo->internal_command) {
+    if (!cmndinfo->internal_command)
         cmndinfo->priority = priority;
-        b = scp->device->channel;
-        t = scp->device->id;
-        if (priority >= DEFAULT_PRI) {
-            if ((b != ha->virt_bus && ha->raw[BUS_L2P(ha,b)].lock) ||
-                (b==ha->virt_bus && t<MAX_HDRIVES && ha->hdr[t].lock)) {
-                TRACE2(("gdth_putq(): locked IO ->update_timeout()\n"));
-                cmndinfo->timeout = gdth_update_timeout(scp, 0);
-            }
-        }
-    }
 
     if (ha->req_first==NULL) {
         ha->req_first = scp;                    /* queue was empty */
@@ -3899,6 +3887,39 @@ static const char *gdth_info(struct Scsi_Host *shp)
     return ((const char *)ha->binfo.type_string);
 }
 
+/* eh_timed_out hook: re-arm timer for locked IO or the first 5 expiries */
+static enum blk_eh_timer_return gdth_timed_out(struct scsi_cmnd *scp)
+{
+	gdth_ha_str *ha = shost_priv(scp->device->host);
+	struct gdth_cmndinfo *cmndinfo = gdth_cmnd_priv(scp);
+	unchar b, t;
+	ulong flags;
+	enum blk_eh_timer_return retval = BLK_EH_NOT_HANDLED;
+
+	TRACE(("%s() cmd 0x%x\n", __func__, scp->cmnd[0]));
+	b = scp->device->channel;
+	t = scp->device->id;
+
+	/*
+	 * We don't really honor the command timeout, but we try to
+	 * honor 6 times the actual command timeout. So reset the
+	 * timer unless this is the 6th (or later) timeout on this command.
+	 */
+	if (++cmndinfo->timeout_count < 6)
+		retval = BLK_EH_RESET_TIMER;
+
+	/* Reset the timeout if it is locked IO */
+	spin_lock_irqsave(&ha->smp_lock, flags);
+	if ((b != ha->virt_bus && ha->raw[BUS_L2P(ha, b)].lock) ||
+	    (b == ha->virt_bus && t < MAX_HDRIVES && ha->hdr[t].lock)) {
+		TRACE2(("%s(): locked IO, reset timeout\n", __func__));
+		retval = BLK_EH_RESET_TIMER;
+	}
+	spin_unlock_irqrestore(&ha->smp_lock, flags);
+
+	return retval;
+}
+
 static int gdth_eh_bus_reset(Scsi_Cmnd *scp)
 {
     gdth_ha_str *ha = shost_priv(scp->device->host);
@@ -3992,7 +4013,7 @@ static int gdth_queuecommand(struct scsi_cmnd *scp,
     BUG_ON(!cmndinfo);
 
     scp->scsi_done = done;
-    gdth_update_timeout(scp, scp->timeout_per_command * 6);
+    cmndinfo->timeout_count = 0;
     cmndinfo->priority = DEFAULT_PRI;
 
     return __gdth_queuecommand(ha, scp, cmndinfo);
@@ -4096,12 +4117,10 @@ static int ioc_lockdrv(void __user *arg)
             ha->hdr[j].lock = 1;
             spin_unlock_irqrestore(&ha->smp_lock, flags);
             gdth_wait_completion(ha, ha->bus_cnt, j);
-            gdth_stop_timeout(ha, ha->bus_cnt, j);
         } else {
             spin_lock_irqsave(&ha->smp_lock, flags);
             ha->hdr[j].lock = 0;
             spin_unlock_irqrestore(&ha->smp_lock, flags);
-            gdth_start_timeout(ha, ha->bus_cnt, j);
             gdth_next(ha);
         }
     } 
@@ -4539,18 +4558,14 @@ static int gdth_ioctl(struct inode *inode, struct file *filep,
                 spin_lock_irqsave(&ha->smp_lock, flags);
                 ha->raw[i].lock = 1;
                 spin_unlock_irqrestore(&ha->smp_lock, flags);
-                for (j = 0; j < ha->tid_cnt; ++j) {
+		for (j = 0; j < ha->tid_cnt; ++j)
                     gdth_wait_completion(ha, i, j);
-                    gdth_stop_timeout(ha, i, j);
-                }
             } else {
                 spin_lock_irqsave(&ha->smp_lock, flags);
                 ha->raw[i].lock = 0;
                 spin_unlock_irqrestore(&ha->smp_lock, flags);
-                for (j = 0; j < ha->tid_cnt; ++j) {
-                    gdth_start_timeout(ha, i, j);
+		for (j = 0; j < ha->tid_cnt; ++j)
                     gdth_next(ha);
-                }
             }
         } 
         break;
@@ -4644,6 +4659,7 @@ static struct scsi_host_template gdth_template = {
         .slave_configure        = gdth_slave_configure,
         .bios_param             = gdth_bios_param,
         .proc_info              = gdth_proc_info,
+	.eh_timed_out		= gdth_timed_out,
         .proc_name              = "gdth",
         .can_queue              = GDTH_MAXCMDS,
         .this_id                = -1,
diff --git a/drivers/scsi/gdth.h b/drivers/scsi/gdth.h
index ca92476727cf..1646444e9bd5 100644
--- a/drivers/scsi/gdth.h
+++ b/drivers/scsi/gdth.h
@@ -916,7 +916,7 @@ typedef struct {
         gdth_cmd_str *internal_cmd_str;         /* crier for internal messages*/
         dma_addr_t sense_paddr;                 /* sense dma-addr */
         unchar priority;
-        int timeout;
+	int timeout_count;			/* # of timeout calls */
         volatile int wait_for_completion;
         ushort status;
         ulong32 info;
diff --git a/drivers/scsi/gdth_proc.c b/drivers/scsi/gdth_proc.c
index ce0228e26aec..59349a316e13 100644
--- a/drivers/scsi/gdth_proc.c
+++ b/drivers/scsi/gdth_proc.c
@@ -748,69 +748,3 @@ static void gdth_wait_completion(gdth_ha_str *ha, int busnum, int id)
     }
     spin_unlock_irqrestore(&ha->smp_lock, flags);
 }
-
-static void gdth_stop_timeout(gdth_ha_str *ha, int busnum, int id)
-{
-    ulong flags;
-    Scsi_Cmnd *scp;
-    unchar b, t;
-
-    spin_lock_irqsave(&ha->smp_lock, flags);
-
-    for (scp = ha->req_first; scp; scp = (Scsi_Cmnd *)scp->SCp.ptr) {
-        struct gdth_cmndinfo *cmndinfo = gdth_cmnd_priv(scp);
-        if (!cmndinfo->internal_command) {
-            b = scp->device->channel;
-            t = scp->device->id;
-            if (t == (unchar)id && b == (unchar)busnum) {
-                TRACE2(("gdth_stop_timeout(): update_timeout()\n"));
-                cmndinfo->timeout = gdth_update_timeout(scp, 0);
-            }
-        }
-    }
-    spin_unlock_irqrestore(&ha->smp_lock, flags);
-}
-
-static void gdth_start_timeout(gdth_ha_str *ha, int busnum, int id)
-{
-    ulong flags;
-    Scsi_Cmnd *scp;
-    unchar b, t;
-
-    spin_lock_irqsave(&ha->smp_lock, flags);
-
-    for (scp = ha->req_first; scp; scp = (Scsi_Cmnd *)scp->SCp.ptr) {
-        struct gdth_cmndinfo *cmndinfo = gdth_cmnd_priv(scp);
-        if (!cmndinfo->internal_command) {
-            b = scp->device->channel;
-            t = scp->device->id;
-            if (t == (unchar)id && b == (unchar)busnum) {
-                TRACE2(("gdth_start_timeout(): update_timeout()\n"));
-                gdth_update_timeout(scp, cmndinfo->timeout);
-            }
-        }
-    }
-    spin_unlock_irqrestore(&ha->smp_lock, flags);
-}
-
-static int gdth_update_timeout(Scsi_Cmnd *scp, int timeout)
-{
-    int oldto;
-
-    oldto = scp->timeout_per_command;
-    scp->timeout_per_command = timeout;
-
-    if (timeout == 0) {
-        del_timer(&scp->eh_timeout);
-        scp->eh_timeout.data = (unsigned long) NULL;
-        scp->eh_timeout.expires = 0;
-    } else {
-        if (scp->eh_timeout.data != (unsigned long) NULL) 
-            del_timer(&scp->eh_timeout);
-        scp->eh_timeout.data = (unsigned long) scp;
-        scp->eh_timeout.expires = jiffies + timeout;
-        add_timer(&scp->eh_timeout);
-    }
-
-    return oldto;
-}
diff --git a/drivers/scsi/gdth_proc.h b/drivers/scsi/gdth_proc.h
index 45e6fdacf36e..9b900cc9ebe8 100644
--- a/drivers/scsi/gdth_proc.h
+++ b/drivers/scsi/gdth_proc.h
@@ -20,9 +20,6 @@ static char *gdth_ioctl_alloc(gdth_ha_str *ha, int size, int scratch,
                               ulong64 *paddr);
 static void gdth_ioctl_free(gdth_ha_str *ha, int size, char *buf, ulong64 paddr);
 static void gdth_wait_completion(gdth_ha_str *ha, int busnum, int id);
-static void gdth_stop_timeout(gdth_ha_str *ha, int busnum, int id);
-static void gdth_start_timeout(gdth_ha_str *ha, int busnum, int id);
-static int gdth_update_timeout(Scsi_Cmnd *scp, int timeout);
 
 #endif
 
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 7b1502c0ab6e..87e09f35d3d4 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -756,7 +756,7 @@ static int ibmvscsi_queuecommand(struct scsi_cmnd *cmnd,
 	init_event_struct(evt_struct,
 			  handle_cmd_rsp,
 			  VIOSRP_SRP_FORMAT,
-			  cmnd->timeout_per_command/HZ);
+			  cmnd->request->timeout/HZ);
 
 	evt_struct->cmnd = cmnd;
 	evt_struct->cmnd_done = done;
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 461331d3dc45..81c16cba5417 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -612,7 +612,7 @@ static int idescsi_queue (struct scsi_cmnd *cmd,
 	pc->req_xfer = pc->buf_size = scsi_bufflen(cmd);
 	pc->scsi_cmd = cmd;
 	pc->done = done;
-	pc->timeout = jiffies + cmd->timeout_per_command;
+	pc->timeout = jiffies + cmd->request->timeout;
 
 	if (test_bit(IDESCSI_LOG_CMD, &scsi->log)) {
 		printk ("ide-scsi: %s: que %lu, cmd = ", drive->name, cmd->serial_number);
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index e7a3a6554425..d30eb7ba018e 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -3670,7 +3670,8 @@ static int ipr_slave_configure(struct scsi_device *sdev)
 			sdev->no_uld_attach = 1;
 		}
 		if (ipr_is_vset_device(res)) {
-			sdev->timeout = IPR_VSET_RW_TIMEOUT;
+			blk_queue_rq_timeout(sdev->request_queue,
+					     IPR_VSET_RW_TIMEOUT);
 			blk_queue_max_sectors(sdev->request_queue, IPR_VSET_MAX_SECTORS);
 		}
 		if (ipr_is_vset_device(res) || ipr_is_scsi_disk(res))
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index bc9e6ddf41df..ef683f0d2b5a 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -3818,7 +3818,7 @@ ips_send_cmd(ips_ha_t * ha, ips_scb_t * scb)
 		scb->cmd.dcdb.segment_4G = 0;
 		scb->cmd.dcdb.enhanced_sg = 0;
 
-		TimeOut = scb->scsi_cmd->timeout_per_command;
+		TimeOut = scb->scsi_cmd->request->timeout;
 
 		if (ha->subsys->param[4] & 0x00100000) {	/* If NEW Tape DCDB is Supported */
 			if (!scb->sg_len) {
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 299e075a7b34..1eca82420aab 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -1476,12 +1476,12 @@ static void iscsi_start_tx(struct iscsi_conn *conn)
 		scsi_queue_work(conn->session->host, &conn->xmitwork);
 }
 
-static enum scsi_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *scmd)
+static enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *scmd)
 {
 	struct iscsi_cls_session *cls_session;
 	struct iscsi_session *session;
 	struct iscsi_conn *conn;
-	enum scsi_eh_timer_return rc = EH_NOT_HANDLED;
+	enum blk_eh_timer_return rc = BLK_EH_NOT_HANDLED;
 
 	cls_session = starget_to_session(scsi_target(scmd->device));
 	session = cls_session->dd_data;
@@ -1494,14 +1494,14 @@ static enum scsi_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *scmd)
 		 * We are probably in the middle of iscsi recovery so let
 		 * that complete and handle the error.
 		 */
-		rc = EH_RESET_TIMER;
+		rc = BLK_EH_RESET_TIMER;
 		goto done;
 	}
 
 	conn = session->leadconn;
 	if (!conn) {
 		/* In the middle of shuting down */
-		rc = EH_RESET_TIMER;
+		rc = BLK_EH_RESET_TIMER;
 		goto done;
 	}
 
@@ -1513,20 +1513,21 @@ static enum scsi_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *scmd)
 	 */
 	if (time_before_eq(conn->last_recv + (conn->recv_timeout * HZ) +
 			    (conn->ping_timeout * HZ), jiffies))
-		rc = EH_RESET_TIMER;
+		rc = BLK_EH_RESET_TIMER;
 	/*
 	 * if we are about to check the transport then give the command
 	 * more time
 	 */
 	if (time_before_eq(conn->last_recv + (conn->recv_timeout * HZ),
 			   jiffies))
-		rc = EH_RESET_TIMER;
+		rc = BLK_EH_RESET_TIMER;
 	/* if in the middle of checking the transport then give us more time */
 	if (conn->ping_task)
-		rc = EH_RESET_TIMER;
+		rc = BLK_EH_RESET_TIMER;
 done:
 	spin_unlock(&session->lock);
-	debug_scsi("return %s\n", rc == EH_RESET_TIMER ? "timer reset" : "nh");
+	debug_scsi("return %s\n", rc == BLK_EH_RESET_TIMER ?
+					"timer reset" : "nh");
 	return rc;
 }
 
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 48ee8c7f5bdd..837b095ba90d 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -398,7 +398,7 @@ void sas_ata_task_abort(struct sas_task *task)
 
 	/* Bounce SCSI-initiated commands to the SCSI EH */
 	if (qc->scsicmd) {
-		scsi_req_abort_cmd(qc->scsicmd);
+		blk_abort_request(qc->scsicmd->request);
 		scsi_schedule_eh(qc->scsicmd->device->host);
 		return;
 	}
diff --git a/drivers/scsi/libsas/sas_internal.h b/drivers/scsi/libsas/sas_internal.h
index b4f9368f116a..0001374bd6b2 100644
--- a/drivers/scsi/libsas/sas_internal.h
+++ b/drivers/scsi/libsas/sas_internal.h
@@ -55,7 +55,7 @@ void sas_unregister_phys(struct sas_ha_struct *sas_ha);
 int  sas_register_ports(struct sas_ha_struct *sas_ha);
 void sas_unregister_ports(struct sas_ha_struct *sas_ha);
 
-enum scsi_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *);
+enum blk_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *);
 
 int  sas_init_queue(struct sas_ha_struct *sas_ha);
 int  sas_init_events(struct sas_ha_struct *sas_ha);
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index a8e3ef309070..744838780ada 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -673,43 +673,43 @@ out:
 	return;
 }
 
-enum scsi_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *cmd)
+enum blk_eh_timer_return sas_scsi_timed_out(struct scsi_cmnd *cmd)
 {
 	struct sas_task *task = TO_SAS_TASK(cmd);
 	unsigned long flags;
 
 	if (!task) {
-		cmd->timeout_per_command /= 2;
+		cmd->request->timeout /= 2;
 		SAS_DPRINTK("command 0x%p, task 0x%p, gone: %s\n",
-			    cmd, task, (cmd->timeout_per_command ?
-			    "EH_RESET_TIMER" : "EH_NOT_HANDLED"));
-		if (!cmd->timeout_per_command)
-			return EH_NOT_HANDLED;
-		return EH_RESET_TIMER;
+			    cmd, task, (cmd->request->timeout ?
+			    "BLK_EH_RESET_TIMER" : "BLK_EH_NOT_HANDLED"));
+		if (!cmd->request->timeout)
+			return BLK_EH_NOT_HANDLED;
+		return BLK_EH_RESET_TIMER;
 	}
 
 	spin_lock_irqsave(&task->task_state_lock, flags);
 	BUG_ON(task->task_state_flags & SAS_TASK_STATE_ABORTED);
 	if (task->task_state_flags & SAS_TASK_STATE_DONE) {
 		spin_unlock_irqrestore(&task->task_state_lock, flags);
-		SAS_DPRINTK("command 0x%p, task 0x%p, timed out: EH_HANDLED\n",
-			    cmd, task);
-		return EH_HANDLED;
+		SAS_DPRINTK("command 0x%p, task 0x%p, timed out: "
+			    "BLK_EH_HANDLED\n", cmd, task);
+		return BLK_EH_HANDLED;
 	}
 	if (!(task->task_state_flags & SAS_TASK_AT_INITIATOR)) {
 		spin_unlock_irqrestore(&task->task_state_lock, flags);
 		SAS_DPRINTK("command 0x%p, task 0x%p, not at initiator: "
-			    "EH_RESET_TIMER\n",
+			    "BLK_EH_RESET_TIMER\n",
 			    cmd, task);
-		return EH_RESET_TIMER;
+		return BLK_EH_RESET_TIMER;
 	}
 	task->task_state_flags |= SAS_TASK_STATE_ABORTED;
 	spin_unlock_irqrestore(&task->task_state_lock, flags);
 
-	SAS_DPRINTK("command 0x%p, task 0x%p, timed out: EH_NOT_HANDLED\n",
+	SAS_DPRINTK("command 0x%p, task 0x%p, timed out: BLK_EH_NOT_HANDLED\n",
 		    cmd, task);
 
-	return EH_NOT_HANDLED;
+	return BLK_EH_NOT_HANDLED;
 }
 
 int sas_ioctl(struct scsi_device *sdev, int cmd, void __user *arg)
@@ -1039,7 +1039,7 @@ void sas_task_abort(struct sas_task *task)
 		return;
 	}
 
-	scsi_req_abort_cmd(sc);
+	blk_abort_request(sc->request);
 	scsi_schedule_eh(sc->device->host);
 }
 
diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c
index 97b763378e7d..afe1de998763 100644
--- a/drivers/scsi/megaraid/megaraid_sas.c
+++ b/drivers/scsi/megaraid/megaraid_sas.c
@@ -1167,7 +1167,7 @@ static int megasas_generic_reset(struct scsi_cmnd *scmd)
  * cmd has not been completed within the timeout period.
  */
 static enum
-scsi_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd)
+blk_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd)
 {
 	struct megasas_cmd *cmd = (struct megasas_cmd *)scmd->SCp.ptr;
 	struct megasas_instance *instance;
@@ -1175,7 +1175,7 @@ scsi_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd)
 
 	if (time_after(jiffies, scmd->jiffies_at_alloc +
 				(MEGASAS_DEFAULT_CMD_TIMEOUT * 2) * HZ)) {
-		return EH_NOT_HANDLED;
+		return BLK_EH_NOT_HANDLED;
 	}
 
 	instance = cmd->instance;
@@ -1189,7 +1189,7 @@ scsi_eh_timer_return megasas_reset_timer(struct scsi_cmnd *scmd)
 
 		spin_unlock_irqrestore(instance->host->host_lock, flags);
 	}
-	return EH_RESET_TIMER;
+	return BLK_EH_RESET_TIMER;
 }
 
 /**
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index c57c94c0ffd2..3b7240e40819 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -4170,8 +4170,8 @@ static int ncr_queue_command (struct ncb *np, struct scsi_cmnd *cmd)
 	**
 	**----------------------------------------------------
 	*/
-	if (np->settle_time && cmd->timeout_per_command >= HZ) {
-		u_long tlimit = jiffies + cmd->timeout_per_command - HZ;
+	if (np->settle_time && cmd->request->timeout >= HZ) {
+		u_long tlimit = jiffies + cmd->request->timeout - HZ;
 		if (time_after(np->settle_time, tlimit))
 			np->settle_time = tlimit;
 	}
diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c
index 37f9ba0cd798..b6cd12b2e996 100644
--- a/drivers/scsi/qla1280.c
+++ b/drivers/scsi/qla1280.c
@@ -2845,7 +2845,7 @@ qla1280_64bit_start_scsi(struct scsi_qla_host *ha, struct srb * sp)
 	memset(((char *)pkt + 8), 0, (REQUEST_ENTRY_SIZE - 8));
 
 	/* Set ISP command timeout. */
-	pkt->timeout = cpu_to_le16(cmd->timeout_per_command/HZ);
+	pkt->timeout = cpu_to_le16(cmd->request->timeout/HZ);
 
 	/* Set device target ID and LUN */
 	pkt->lun = SCSI_LUN_32(cmd);
@@ -3114,7 +3114,7 @@ qla1280_32bit_start_scsi(struct scsi_qla_host *ha, struct srb * sp)
 	memset(((char *)pkt + 8), 0, (REQUEST_ENTRY_SIZE - 8));
 
 	/* Set ISP command timeout. */
-	pkt->timeout = cpu_to_le16(cmd->timeout_per_command/HZ);
+	pkt->timeout = cpu_to_le16(cmd->request->timeout/HZ);
 
 	/* Set device target ID and LUN */
 	pkt->lun = SCSI_LUN_32(cmd);
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 88bebb13bc52..de8279ad7d89 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -1542,7 +1542,7 @@ static int qla4xxx_eh_device_reset(struct scsi_cmnd *cmd)
 	DEBUG2(printk(KERN_INFO
 		      "scsi%ld: DEVICE_RESET cmd=%p jiffies = 0x%lx, to=%x,"
 		      "dpc_flags=%lx, status=%x allowed=%d\n", ha->host_no,
-		      cmd, jiffies, cmd->timeout_per_command / HZ,
+		      cmd, jiffies, cmd->request->timeout / HZ,
 		      ha->dpc_flags, cmd->result, cmd->allowed));
 
 	/* FIXME: wait for hba to go online */
@@ -1598,7 +1598,7 @@ static int qla4xxx_eh_target_reset(struct scsi_cmnd *cmd)
 	DEBUG2(printk(KERN_INFO
 		      "scsi%ld: TARGET_DEVICE_RESET cmd=%p jiffies = 0x%lx, "
 		      "to=%x,dpc_flags=%lx, status=%x allowed=%d\n",
-		      ha->host_no, cmd, jiffies, cmd->timeout_per_command / HZ,
+		      ha->host_no, cmd, jiffies, cmd->request->timeout / HZ,
 		      ha->dpc_flags, cmd->result, cmd->allowed));
 
 	stat = qla4xxx_reset_target(ha, ddb_entry);
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index ee6be596503d..dbeb86cafc0d 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -291,7 +291,6 @@ struct scsi_cmnd *scsi_get_command(struct scsi_device *dev, gfp_t gfp_mask)
 		unsigned long flags;
 
 		cmd->device = dev;
-		init_timer(&cmd->eh_timeout);
 		INIT_LIST_HEAD(&cmd->list);
 		spin_lock_irqsave(&dev->list_lock, flags);
 		list_add_tail(&cmd->list, &dev->cmd_list);
@@ -652,14 +651,19 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 	unsigned long timeout;
 	int rtn = 0;
 
+	/*
+	 * We will use a queued command if possible, otherwise we will
+	 * emulate the queuing and calling of completion function ourselves.
+	 */
+	atomic_inc(&cmd->device->iorequest_cnt);
+
 	/* check if the device is still usable */
 	if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
 		/* in SDEV_DEL we error all commands. DID_NO_CONNECT
 		 * returns an immediate error upwards, and signals
 		 * that the device is no longer present */
 		cmd->result = DID_NO_CONNECT << 16;
-		atomic_inc(&cmd->device->iorequest_cnt);
-		__scsi_done(cmd);
+		scsi_done(cmd);
 		/* return 0 (because the command has been processed) */
 		goto out;
 	}
@@ -672,6 +676,7 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 		 * future requests should not occur until the device 
 		 * transitions out of the suspend state.
 		 */
+
 		scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
 
 		SCSI_LOG_MLQUEUE(3, printk("queuecommand : device blocked \n"));
@@ -714,20 +719,8 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 		host->resetting = 0;
 	}
 
-	/* 
-	 * AK: unlikely race here: for some reason the timer could
-	 * expire before the serial number is set up below.
-	 */
-	scsi_add_timer(cmd, cmd->timeout_per_command, scsi_times_out);
-
 	scsi_log_send(cmd);
 
-	/*
-	 * We will use a queued command if possible, otherwise we will
-	 * emulate the queuing and calling of completion function ourselves.
-	 */
-	atomic_inc(&cmd->device->iorequest_cnt);
-
 	/*
 	 * Before we queue this command, check if the command
 	 * length exceeds what the host adapter can handle.
@@ -744,6 +737,12 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 	}
 
 	spin_lock_irqsave(host->host_lock, flags);
+	/*
+	 * AK: unlikely race here: for some reason the timer could
+	 * expire before the serial number is set up below.
+	 *
+	 * TODO: kill serial or move to blk layer
+	 */
 	scsi_cmd_get_serial(host, cmd); 
 
 	if (unlikely(host->shost_state == SHOST_DEL)) {
@@ -754,12 +753,8 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 	}
 	spin_unlock_irqrestore(host->host_lock, flags);
 	if (rtn) {
-		if (scsi_delete_timer(cmd)) {
-			atomic_inc(&cmd->device->iodone_cnt);
-			scsi_queue_insert(cmd,
-					  (rtn == SCSI_MLQUEUE_DEVICE_BUSY) ?
-					  rtn : SCSI_MLQUEUE_HOST_BUSY);
-		}
+		scsi_queue_insert(cmd, (rtn == SCSI_MLQUEUE_DEVICE_BUSY) ?
+						rtn : SCSI_MLQUEUE_HOST_BUSY);
 		SCSI_LOG_MLQUEUE(3,
 		    printk("queuecommand : request rejected\n"));
 	}
@@ -769,24 +764,6 @@ int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 	return rtn;
 }
 
-/**
- * scsi_req_abort_cmd -- Request command recovery for the specified command
- * @cmd: pointer to the SCSI command of interest
- *
- * This function requests that SCSI Core start recovery for the
- * command by deleting the timer and adding the command to the eh
- * queue.  It can be called by either LLDDs or SCSI Core.  LLDDs who
- * implement their own error recovery MAY ignore the timeout event if
- * they generated scsi_req_abort_cmd.
- */
-void scsi_req_abort_cmd(struct scsi_cmnd *cmd)
-{
-	if (!scsi_delete_timer(cmd))
-		return;
-	scsi_times_out(cmd);
-}
-EXPORT_SYMBOL(scsi_req_abort_cmd);
-
 /**
  * scsi_done - Enqueue the finished SCSI command into the done queue.
  * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
@@ -802,42 +779,7 @@ EXPORT_SYMBOL(scsi_req_abort_cmd);
  */
 static void scsi_done(struct scsi_cmnd *cmd)
 {
-	/*
-	 * We don't have to worry about this one timing out anymore.
-	 * If we are unable to remove the timer, then the command
-	 * has already timed out.  In which case, we have no choice but to
-	 * let the timeout function run, as we have no idea where in fact
-	 * that function could really be.  It might be on another processor,
-	 * etc, etc.
-	 */
-	if (!scsi_delete_timer(cmd))
-		return;
-	__scsi_done(cmd);
-}
-
-/* Private entry to scsi_done() to complete a command when the timer
- * isn't running --- used by scsi_times_out */
-void __scsi_done(struct scsi_cmnd *cmd)
-{
-	struct request *rq = cmd->request;
-
-	/*
-	 * Set the serial numbers back to zero
-	 */
-	cmd->serial_number = 0;
-
-	atomic_inc(&cmd->device->iodone_cnt);
-	if (cmd->result)
-		atomic_inc(&cmd->device->ioerr_cnt);
-
-	BUG_ON(!rq);
-
-	/*
-	 * The uptodate/nbytes values don't matter, as we allow partial
-	 * completes and thus will check this in the softirq callback
-	 */
-	rq->completion_data = cmd;
-	blk_complete_request(rq);
+	blk_complete_request(cmd->request);
 }
 
 /* Move this to a header if it becomes more generally useful */
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 39ce3aba1dac..fecefa05cb62 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -111,70 +111,9 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
 	return ret;
 }
 
-/**
- * scsi_add_timer - Start timeout timer for a single scsi command.
- * @scmd:	scsi command that is about to start running.
- * @timeout:	amount of time to allow this command to run.
- * @complete:	timeout function to call if timer isn't canceled.
- *
- * Notes:
- *    This should be turned into an inline function.  Each scsi command
- *    has its own timer, and as it is added to the queue, we set up the
- *    timer.  When the command completes, we cancel the timer.
- */
-void scsi_add_timer(struct scsi_cmnd *scmd, int timeout,
-		    void (*complete)(struct scsi_cmnd *))
-{
-
-	/*
-	 * If the clock was already running for this command, then
-	 * first delete the timer.  The timer handling code gets rather
-	 * confused if we don't do this.
-	 */
-	if (scmd->eh_timeout.function)
-		del_timer(&scmd->eh_timeout);
-
-	scmd->eh_timeout.data = (unsigned long)scmd;
-	scmd->eh_timeout.expires = jiffies + timeout;
-	scmd->eh_timeout.function = (void (*)(unsigned long)) complete;
-
-	SCSI_LOG_ERROR_RECOVERY(5, printk("%s: scmd: %p, time:"
-					  " %d, (%p)\n", __func__,
-					  scmd, timeout, complete));
-
-	add_timer(&scmd->eh_timeout);
-}
-
-/**
- * scsi_delete_timer - Delete/cancel timer for a given function.
- * @scmd:	Cmd that we are canceling timer for
- *
- * Notes:
- *     This should be turned into an inline function.
- *
- * Return value:
- *     1 if we were able to detach the timer.  0 if we blew it, and the
- *     timer function has already started to run.
- */
-int scsi_delete_timer(struct scsi_cmnd *scmd)
-{
-	int rtn;
-
-	rtn = del_timer(&scmd->eh_timeout);
-
-	SCSI_LOG_ERROR_RECOVERY(5, printk("%s: scmd: %p,"
-					 " rtn: %d\n", __func__,
-					 scmd, rtn));
-
-	scmd->eh_timeout.data = (unsigned long)NULL;
-	scmd->eh_timeout.function = NULL;
-
-	return rtn;
-}
-
 /**
  * scsi_times_out - Timeout function for normal scsi commands.
- * @scmd:	Cmd that is timing out.
+ * @req:	request that is timing out.
  *
  * Notes:
  *     We do not need to lock this.  There is the potential for a race
@@ -182,9 +121,11 @@ int scsi_delete_timer(struct scsi_cmnd *scmd)
  *     normal completion function determines that the timer has already
  *     fired, then it mustn't do anything.
  */
-void scsi_times_out(struct scsi_cmnd *scmd)
+enum blk_eh_timer_return scsi_times_out(struct request *req)
 {
-	enum scsi_eh_timer_return (* eh_timed_out)(struct scsi_cmnd *);
+	struct scsi_cmnd *scmd = req->special;
+	enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
+	enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
 
 	scsi_log_completion(scmd, TIMEOUT_ERROR);
 
@@ -196,22 +137,20 @@ void scsi_times_out(struct scsi_cmnd *scmd)
 		eh_timed_out = NULL;
 
 	if (eh_timed_out)
-		switch (eh_timed_out(scmd)) {
-		case EH_HANDLED:
-			__scsi_done(scmd);
-			return;
-		case EH_RESET_TIMER:
-			scsi_add_timer(scmd, scmd->timeout_per_command,
-				       scsi_times_out);
-			return;
-		case EH_NOT_HANDLED:
+		rtn = eh_timed_out(scmd);
+		switch (rtn) {
+		case BLK_EH_NOT_HANDLED:
 			break;
+		default:
+			return rtn;
 		}
 
 	if (unlikely(!scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD))) {
 		scmd->result |= DID_TIME_OUT << 16;
-		__scsi_done(scmd);
+		return BLK_EH_HANDLED;
 	}
+
+	return BLK_EH_NOT_HANDLED;
 }
 
 /**
@@ -1793,7 +1732,6 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
 
 	blk_rq_init(NULL, &req);
 	scmd->request = &req;
-	memset(&scmd->eh_timeout, 0, sizeof(scmd->eh_timeout));
 
 	scmd->cmnd = req.cmd;
 
@@ -1804,8 +1742,6 @@ scsi_reset_provider(struct scsi_device *dev, int flag)
 
 	scmd->sc_data_direction		= DMA_BIDIRECTIONAL;
 
-	init_timer(&scmd->eh_timeout);
-
 	spin_lock_irqsave(shost->host_lock, flags);
 	shost->tmf_in_progress = 1;
 	spin_unlock_irqrestore(shost->host_lock, flags);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 62307bd794a9..e7686500e9dd 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1181,7 +1181,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 	
 	cmd->transfersize = req->data_len;
 	cmd->allowed = req->retries;
-	cmd->timeout_per_command = req->timeout;
 	return BLKPREP_OK;
 }
 EXPORT_SYMBOL(scsi_setup_blk_pc_cmnd);
@@ -1416,17 +1415,26 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
 	spin_unlock(shost->host_lock);
 	spin_lock(sdev->request_queue->queue_lock);
 
-	__scsi_done(cmd);
+	blk_complete_request(req);
 }
 
 static void scsi_softirq_done(struct request *rq)
 {
-	struct scsi_cmnd *cmd = rq->completion_data;
-	unsigned long wait_for = (cmd->allowed + 1) * cmd->timeout_per_command;
+	struct scsi_cmnd *cmd = rq->special;
+	unsigned long wait_for = (cmd->allowed + 1) * rq->timeout;
 	int disposition;
 
 	INIT_LIST_HEAD(&cmd->eh_entry);
 
+	/*
+	 * Set the serial numbers back to zero
+	 */
+	cmd->serial_number = 0;
+
+	atomic_inc(&cmd->device->iodone_cnt);
+	if (cmd->result)
+		atomic_inc(&cmd->device->ioerr_cnt);
+
 	disposition = scsi_decide_disposition(cmd);
 	if (disposition != SUCCESS &&
 	    time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
@@ -1675,6 +1683,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
 
 	blk_queue_prep_rq(q, scsi_prep_fn);
 	blk_queue_softirq_done(q, scsi_softirq_done);
+	blk_queue_rq_timed_out(q, scsi_times_out);
 	return q;
 }
 
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 79f0f7511204..6cddd5dd323c 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -4,6 +4,7 @@
 #include <linux/device.h>
 
 struct request_queue;
+struct request;
 struct scsi_cmnd;
 struct scsi_device;
 struct scsi_host_template;
@@ -27,7 +28,6 @@ extern void scsi_exit_hosts(void);
 extern int scsi_dispatch_cmd(struct scsi_cmnd *cmd);
 extern int scsi_setup_command_freelist(struct Scsi_Host *shost);
 extern void scsi_destroy_command_freelist(struct Scsi_Host *shost);
-extern void __scsi_done(struct scsi_cmnd *cmd);
 #ifdef CONFIG_SCSI_LOGGING
 void scsi_log_send(struct scsi_cmnd *cmd);
 void scsi_log_completion(struct scsi_cmnd *cmd, int disposition);
@@ -49,10 +49,7 @@ extern int __init scsi_init_devinfo(void);
 extern void scsi_exit_devinfo(void);
 
 /* scsi_error.c */
-extern void scsi_add_timer(struct scsi_cmnd *, int,
-		void (*)(struct scsi_cmnd *));
-extern int scsi_delete_timer(struct scsi_cmnd *);
-extern void scsi_times_out(struct scsi_cmnd *cmd);
+extern enum blk_eh_timer_return scsi_times_out(struct request *req);
 extern int scsi_error_handler(void *host);
 extern int scsi_decide_disposition(struct scsi_cmnd *cmd);
 extern void scsi_eh_wakeup(struct Scsi_Host *shost);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index ab3c71869be5..7f618ee5ecea 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -560,12 +560,15 @@ sdev_rd_attr (vendor, "%.8s\n");
 sdev_rd_attr (model, "%.16s\n");
 sdev_rd_attr (rev, "%.4s\n");
 
+/*
+ * TODO: can we make these symlinks to the block layer ones?
+ */
 static ssize_t
 sdev_show_timeout (struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct scsi_device *sdev;
 	sdev = to_scsi_device(dev);
-	return snprintf (buf, 20, "%d\n", sdev->timeout / HZ);
+	return snprintf(buf, 20, "%d\n", sdev->request_queue->rq_timeout / HZ);
 }
 
 static ssize_t
@@ -576,7 +579,7 @@ sdev_store_timeout (struct device *dev, struct device_attribute *attr,
 	int timeout;
 	sdev = to_scsi_device(dev);
 	sscanf (buf, "%d\n", &timeout);
-	sdev->timeout = timeout * HZ;
+	blk_queue_rq_timeout(sdev->request_queue, timeout * HZ);
 	return count;
 }
 static DEVICE_ATTR(timeout, S_IRUGO | S_IWUSR, sdev_show_timeout, sdev_store_timeout);
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 56823fd1fb84..9168883d0dfe 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -1950,15 +1950,15 @@ static int fc_vport_match(struct attribute_container *cont,
  * Notes:
  *	This routine assumes no locks are held on entry.
  */
-static enum scsi_eh_timer_return
+static enum blk_eh_timer_return
 fc_timed_out(struct scsi_cmnd *scmd)
 {
 	struct fc_rport *rport = starget_to_rport(scsi_target(scmd->device));
 
 	if (rport->port_state == FC_PORTSTATE_BLOCKED)
-		return EH_RESET_TIMER;
+		return BLK_EH_RESET_TIMER;
 
-	return EH_NOT_HANDLED;
+	return BLK_EH_NOT_HANDLED;
 }
 
 /*
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index cb115d1bf228..c0cf4acda7de 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -383,7 +383,6 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	sector_t block = rq->sector;
 	sector_t threshold;
 	unsigned int this_count = rq->nr_sectors;
-	unsigned int timeout = sdp->timeout;
 	int ret;
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -584,7 +583,6 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	SCpnt->transfersize = sdp->sector_size;
 	SCpnt->underflow = this_count << 9;
 	SCpnt->allowed = SD_MAX_RETRIES;
-	SCpnt->timeout_per_command = timeout;
 
 	/*
 	 * This indicates that the command is ready from our end to be
@@ -1878,11 +1876,12 @@ static int sd_probe(struct device *dev)
 	sdkp->openers = 0;
 	sdkp->previous_state = 1;
 
-	if (!sdp->timeout) {
+	if (!sdp->request_queue->rq_timeout) {
 		if (sdp->type != TYPE_MOD)
-			sdp->timeout = SD_TIMEOUT;
+			blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
 		else
-			sdp->timeout = SD_MOD_TIMEOUT;
+			blk_queue_rq_timeout(sdp->request_queue,
+					     SD_MOD_TIMEOUT);
 	}
 
 	device_initialize(&sdkp->dev);
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 8dbe3798d5fd..0f17009c99d2 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -331,7 +331,7 @@ static int sr_done(struct scsi_cmnd *SCpnt)
 
 static int sr_prep_fn(struct request_queue *q, struct request *rq)
 {
-	int block=0, this_count, s_size, timeout = SR_TIMEOUT;
+	int block = 0, this_count, s_size;
 	struct scsi_cd *cd;
 	struct scsi_cmnd *SCpnt;
 	struct scsi_device *sdp = q->queuedata;
@@ -461,7 +461,6 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
 	SCpnt->transfersize = cd->device->sector_size;
 	SCpnt->underflow = this_count << 9;
 	SCpnt->allowed = MAX_RETRIES;
-	SCpnt->timeout_per_command = timeout;
 
 	/*
 	 * This indicates that the command is ready from our end to be
@@ -620,6 +619,8 @@ static int sr_probe(struct device *dev)
 	disk->fops = &sr_bdops;
 	disk->flags = GENHD_FL_CD;
 
+	blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT);
+
 	cd->device = sdev;
 	cd->disk = disk;
 	cd->driver = &sr_template;
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index d39107b7669b..f4e6cde1fd0d 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -519,8 +519,8 @@ static int sym53c8xx_queue_command(struct scsi_cmnd *cmd,
 	 *  Shorten our settle_time if needed for 
 	 *  this command not to time out.
 	 */
-	if (np->s.settle_time_valid && cmd->timeout_per_command) {
-		unsigned long tlimit = jiffies + cmd->timeout_per_command;
+	if (np->s.settle_time_valid && cmd->request->timeout) {
+		unsigned long tlimit = jiffies + cmd->request->timeout;
 		tlimit -= SYM_CONF_TIMER_INTERVAL*2;
 		if (time_after(np->s.settle_time, tlimit)) {
 			np->s.settle_time = tlimit;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9c2549260427..067f28b80072 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -147,6 +147,7 @@ struct request {
 
 	unsigned int cmd_flags;
 	enum rq_cmd_type_bits cmd_type;
+	unsigned long atomic_flags;
 
 	/* Maintain bio traversal state for part by part I/O submission.
 	 * hard_* are block layer internals, no driver should touch them!
@@ -214,6 +215,8 @@ struct request {
 	void *data;
 	void *sense;
 
+	unsigned long deadline;
+	struct list_head timeout_list;
 	unsigned int timeout;
 	int retries;
 
@@ -266,6 +269,14 @@ typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 
+enum blk_eh_timer_return {
+	BLK_EH_NOT_HANDLED,
+	BLK_EH_HANDLED,
+	BLK_EH_RESET_TIMER,
+};
+
+typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *);
+
 enum blk_queue_state {
 	Queue_down,
 	Queue_up,
@@ -311,6 +322,7 @@ struct request_queue
 	merge_bvec_fn		*merge_bvec_fn;
 	prepare_flush_fn	*prepare_flush_fn;
 	softirq_done_fn		*softirq_done_fn;
+	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 
 	/*
@@ -386,6 +398,10 @@ struct request_queue
 	unsigned int		nr_sorted;
 	unsigned int		in_flight;
 
+	unsigned int		rq_timeout;
+	struct timer_list	timeout;
+	struct list_head	timeout_list;
+
 	/*
 	 * sg stuff
 	 */
@@ -770,6 +786,8 @@ extern int blk_end_request_callback(struct request *rq, int error,
 				unsigned int nr_bytes,
 				int (drv_callback)(struct request *));
 extern void blk_complete_request(struct request *);
+extern void __blk_complete_request(struct request *);
+extern void blk_abort_request(struct request *);
 
 /*
  * blk_end_request() takes bytes instead of sectors as a complete size.
@@ -811,6 +829,8 @@ extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *);
+extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
+extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
 extern int blk_do_ordered(struct request_queue *, struct request **);
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index f9f6e793575c..855bf95963e7 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -75,7 +75,6 @@ struct scsi_cmnd {
 
 	int retries;
 	int allowed;
-	int timeout_per_command;
 
 	unsigned char prot_op;
 	unsigned char prot_type;
@@ -86,7 +85,6 @@ struct scsi_cmnd {
 	/* These elements define the operation we are about to perform */
 	unsigned char *cmnd;
 
-	struct timer_list eh_timeout;	/* Used to time out the command. */
 
 	/* These elements define the operation we ultimately want to perform */
 	struct scsi_data_buffer sdb;
@@ -139,7 +137,6 @@ extern void scsi_put_command(struct scsi_cmnd *);
 extern void __scsi_put_command(struct Scsi_Host *, struct scsi_cmnd *,
 			       struct device *);
 extern void scsi_finish_command(struct scsi_cmnd *cmd);
-extern void scsi_req_abort_cmd(struct scsi_cmnd *cmd);
 
 extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
 				 size_t *offset, size_t *len);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 44a55d1bf530..d123ca84e732 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -43,13 +43,6 @@ struct blk_queue_tags;
 #define DISABLE_CLUSTERING 0
 #define ENABLE_CLUSTERING 1
 
-enum scsi_eh_timer_return {
-	EH_NOT_HANDLED,
-	EH_HANDLED,
-	EH_RESET_TIMER,
-};
-
-
 struct scsi_host_template {
 	struct module *module;
 	const char *name;
@@ -347,7 +340,7 @@ struct scsi_host_template {
 	 *
 	 * Status: OPTIONAL
 	 */
-	enum scsi_eh_timer_return (* eh_timed_out)(struct scsi_cmnd *);
+	enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
 
 	/*
 	 * Name of proc directory
diff --git a/include/scsi/scsi_transport.h b/include/scsi/scsi_transport.h
index 490bd13a634c..0de32cd4e8a7 100644
--- a/include/scsi/scsi_transport.h
+++ b/include/scsi/scsi_transport.h
@@ -21,6 +21,7 @@
 #define SCSI_TRANSPORT_H
 
 #include <linux/transport_class.h>
+#include <linux/blkdev.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
 
@@ -64,7 +65,7 @@ struct scsi_transport_template {
 	 *			begin counting again
 	 * EH_NOT_HANDLED	Begin normal error recovery
 	 */
-	enum scsi_eh_timer_return (* eh_timed_out)(struct scsi_cmnd *);
+	enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
 
 	/*
 	 * Used as callback for the completion of i_t_nexus request

From 11914a53d2ec2974a565311af327b8983d8c820d Mon Sep 17 00:00:00 2001
From: Mike Anderson <andmike@linux.vnet.ibm.com>
Date: Sat, 13 Sep 2008 20:31:27 +0200
Subject: [PATCH 085/132] block: Add interface to abort queued requests

Signed-off-by: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-timeout.c          | 22 ++++++++++++++++++++++
 block/elevator.c             | 13 +++++++++++++
 include/linux/blkdev.h       |  1 +
 include/linux/blktrace_api.h |  2 ++
 include/linux/elevator.h     |  1 +
 5 files changed, 39 insertions(+)

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index b36d07bf0afb..6e5c781c5af1 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -153,3 +153,25 @@ void blk_add_timer(struct request *req)
 	    time_before(expiry, q->timeout.expires))
 		mod_timer(&q->timeout, expiry);
 }
+
+/**
+ * blk_abort_queue -- Abort all requests on given queue
+ * @q:	pointer to queue
+ *
+ */
+void blk_abort_queue(struct request_queue *q)
+{
+	unsigned long flags;
+	struct request *rq, *tmp;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	elv_abort_queue(q);
+
+	list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
+		blk_abort_request(rq);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+}
+EXPORT_SYMBOL_GPL(blk_abort_queue);
diff --git a/block/elevator.c b/block/elevator.c
index a91fc59edd01..8a74eedc3530 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -914,6 +914,19 @@ int elv_may_queue(struct request_queue *q, int rw)
 	return ELV_MQUEUE_MAY;
 }
 
+void elv_abort_queue(struct request_queue *q)
+{
+	struct request *rq;
+
+	while (!list_empty(&q->queue_head)) {
+		rq = list_entry_rq(q->queue_head.next);
+		rq->cmd_flags |= REQ_QUIET;
+		blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+		end_queued_request(rq, 0);
+	}
+}
+EXPORT_SYMBOL(elv_abort_queue);
+
 void elv_completed_request(struct request_queue *q, struct request *rq)
 {
 	elevator_t *e = q->elevator;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 067f28b80072..37781d6fe045 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -788,6 +788,7 @@ extern int blk_end_request_callback(struct request *rq, int error,
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
+extern void blk_abort_queue(struct request_queue *);
 
 /*
  * blk_end_request() takes bytes instead of sectors as a complete size.
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 27da2cc682ee..dcaf2452ed1f 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -48,6 +48,7 @@ enum blktrace_act {
 	__BLK_TA_SPLIT,			/* bio was split */
 	__BLK_TA_BOUNCE,		/* bio was bounced */
 	__BLK_TA_REMAP,			/* bio was remapped */
+	__BLK_TA_ABORT,			/* request aborted */
 };
 
 /*
@@ -78,6 +79,7 @@ enum blktrace_notify {
 #define BLK_TA_SPLIT		(__BLK_TA_SPLIT)
 #define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
 #define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_ABORT		(__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE))
 
 #define BLK_TN_PROCESS		(__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_TIMESTAMP	(__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index bb791c311a56..92f6f634e3e6 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -112,6 +112,7 @@ extern struct request *elv_latter_request(struct request_queue *, struct request
 extern int elv_register_queue(struct request_queue *q);
 extern void elv_unregister_queue(struct request_queue *q);
 extern int elv_may_queue(struct request_queue *, int);
+extern void elv_abort_queue(struct request_queue *);
 extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
 extern void elv_put_request(struct request_queue *, struct request *);

From 224cb3e981f1b2f9f93dbd49eaef505d17d894c2 Mon Sep 17 00:00:00 2001
From: Mike Anderson <andmike@linux.vnet.ibm.com>
Date: Fri, 29 Aug 2008 09:36:09 +0200
Subject: [PATCH 086/132] dm: Call blk_abort_queue on failed paths

Signed-off-by: Mike Anderson <andmike@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/md/dm-mpath.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c2fcf28b4c70..3d3848132c69 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -33,6 +33,7 @@ struct pgpath {
 	unsigned fail_count;		/* Cumulative failure count */
 
 	struct dm_path path;
+	struct work_struct deactivate_path;
 };
 
 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -112,6 +113,7 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
+static void deactivate_path(struct work_struct *work);
 
 
 /*-----------------------------------------------
@@ -122,8 +124,10 @@ static struct pgpath *alloc_pgpath(void)
 {
 	struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
 
-	if (pgpath)
+	if (pgpath) {
 		pgpath->path.is_active = 1;
+		INIT_WORK(&pgpath->deactivate_path, deactivate_path);
+	}
 
 	return pgpath;
 }
@@ -133,6 +137,14 @@ static void free_pgpath(struct pgpath *pgpath)
 	kfree(pgpath);
 }
 
+static void deactivate_path(struct work_struct *work)
+{
+	struct pgpath *pgpath =
+		container_of(work, struct pgpath, deactivate_path);
+
+	blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
+}
+
 static struct priority_group *alloc_priority_group(void)
 {
 	struct priority_group *pg;
@@ -870,6 +882,7 @@ static int fail_path(struct pgpath *pgpath)
 		      pgpath->path.dev->name, m->nr_valid_paths);
 
 	queue_work(kmultipathd, &m->trigger_event);
+	queue_work(kmultipathd, &pgpath->deactivate_path);
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);

From a91a3a20e06621b9931793888583efe37db4e4e8 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 22:50:01 +0900
Subject: [PATCH 087/132] sg: rename sg_cmd_done sg_rq_end_io

The old sg_rq_end_io() was used to wrap sg_cmd_done() while converting
sg to use the block layer (in order to cover the difference between
scsi_execute_async() and blk_execute_rq_nowait()). Now we don't need
the wrapper, so let's remove it and rename sg_cmd_done() to
sg_rq_end_io().

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 50c07bca727d..d18f90d1d9ad 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -177,7 +177,7 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
 
 static int sg_fasync(int fd, struct file *filp, int mode);
 /* tasklet or soft irq callback */
-static void sg_cmd_done(void *data, char *sense, int result, int resid);
+static void sg_rq_end_io(struct request *rq, int uptodate);
 static int sg_start_req(Sg_request *srp, unsigned char *cmd);
 static void sg_finish_rem_req(Sg_request * srp);
 static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size);
@@ -227,11 +227,6 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd)
 				  cmd, filp->f_mode & FMODE_WRITE);
 }
 
-static void sg_rq_end_io(struct request *rq, int uptodate)
-{
-	sg_cmd_done(rq->end_io_data, rq->sense, rq->errors, rq->data_len);
-}
-
 static int
 sg_open(struct inode *inode, struct file *filp)
 {
@@ -1257,16 +1252,19 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 	return 0;
 }
 
-/* This function is a "bottom half" handler that is called by the
- * mid level when a command is completed (or has failed). */
-static void
-sg_cmd_done(void *data, char *sense, int result, int resid)
+/*
+ * This function is a "bottom half" handler that is called by the mid
+ * level when a command is completed (or has failed).
+ */
+static void sg_rq_end_io(struct request *rq, int uptodate)
 {
-	Sg_request *srp = data;
+	struct sg_request *srp = rq->end_io_data;
 	Sg_device *sdp = NULL;
 	Sg_fd *sfp;
 	unsigned long iflags;
 	unsigned int ms;
+	char *sense;
+	int result, resid;
 
 	if (NULL == srp) {
 		printk(KERN_ERR "sg_cmd_done: NULL request\n");
@@ -1280,6 +1278,9 @@ sg_cmd_done(void *data, char *sense, int result, int resid)
 		return;
 	}
 
+	sense = rq->sense;
+	result = rq->errors;
+	resid = rq->data_len;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_cmd_done: %s, pack_id=%d, res=0x%x\n",
 		sdp->disk->disk_name, srp->header.pack_id, result));

From 7e56cb0f7e7a132803ffefa0a5a15fb2079afaf1 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 22:50:02 +0900
Subject: [PATCH 088/132] sg: remove SG_ALLOW_DIO_CODE define

sg used to have lots of its own functions for direct IO, but now it
uses the block layer functions instead. Only five lines of direct IO
code remain. The SG_ALLOW_DIO_CODE define was used to compile out the
direct IO code, but we no longer need it. If someone wants to remove
the direct IO code, they can do so easily without the define.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index d18f90d1d9ad..2c30331abbed 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -68,7 +68,6 @@ static void sg_proc_cleanup(void);
 #endif
 
 #define SG_ALLOW_DIO_DEF 0
-#define SG_ALLOW_DIO_CODE /* compile out by commenting this define */
 
 #define SG_MAX_DEVS 32768
 
@@ -1674,13 +1673,12 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
 		return 0;
 
-#ifdef SG_ALLOW_DIO_CODE
 	if (sg_allow_dio && (hp->flags & SG_FLAG_DIRECT_IO) &&
 	    (dxfer_dir != SG_DXFER_UNKNOWN) && (0 == hp->iovec_count) &&
 	    (!sfp->parentdp->device->host->unchecked_isa_dma) &&
 	    blk_rq_aligned(q, hp->dxferp, dxfer_len))
 		return sg_build_direct(srp, sfp, dxfer_len);
-#endif
+
 	if ((!sg_res_in_use(sfp)) && (dxfer_len <= rsv_schp->bufflen))
 		sg_link_reserve(sfp, srp, dxfer_len);
 	else

From fd1c1de0766844af4cfc39298e109ad273e72a9e Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 22:50:03 +0900
Subject: [PATCH 089/132] sg: remove b_malloc_len in sg_scatter_hold struct

It's not used for anything useful after the block layer conversion.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 2c30331abbed..ccce31a400ea 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -116,7 +116,6 @@ typedef struct sg_scatter_hold { /* holding area for scsi scatter gather info */
 	unsigned short k_use_sg; /* Count of kernel scatter-gather pieces */
 	unsigned sglist_len; /* size of malloc'd scatter-gather list ++ */
 	unsigned bufflen;	/* Size of (aggregate) data buffer */
-	unsigned b_malloc_len;	/* actual len malloc'ed in buffer */
 	struct page **pages;
 	int page_order;
 	char dio_in_use;	/* 0->indirect IO (or mmap), 1->dio */
@@ -1986,7 +1985,6 @@ sg_link_reserve(Sg_fd * sfp, Sg_request * srp, int size)
 			req_schp->pages = rsv_schp->pages;
 
 			req_schp->bufflen = size;
-			req_schp->b_malloc_len = rsv_schp->b_malloc_len;
 			req_schp->page_order = rsv_schp->page_order;
 			break;
 		} else

From 44c7b0eaa041007066e30ab4869d5bbf8dad5989 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 22:50:04 +0900
Subject: [PATCH 090/132] sg: remove __sg_start_req

__sg_start_req() was used temporarily to call blk_get_request() while
sg was being converted to use the block layer.

Now sg always calls blk_get_request() so we can move blk_get_request()
to sg_start_req(). We don't need __sg_start_req anymore.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 38 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 25 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index ccce31a400ea..9a56c0d320bf 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1626,14 +1626,23 @@ exit_sg(void)
 	idr_destroy(&sg_index_idr);
 }
 
-static int __sg_start_req(struct sg_request *srp, struct sg_io_hdr *hp,
-			  unsigned char *cmd)
+static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 {
-	struct sg_fd *sfp = srp->parentfp;
-	struct request_queue *q = sfp->parentdp->device->request_queue;
+	int res = 0;
 	struct request *rq;
+	Sg_fd *sfp = srp->parentfp;
+	sg_io_hdr_t *hp = &srp->header;
+	int dxfer_len = (int) hp->dxfer_len;
+	int dxfer_dir = hp->dxfer_direction;
+	Sg_scatter_hold *req_schp = &srp->data;
+	Sg_scatter_hold *rsv_schp = &sfp->reserve;
+	struct request_queue *q = sfp->parentdp->device->request_queue;
+	struct rq_map_data map_data;
 	int rw = hp->dxfer_direction == SG_DXFER_TO_DEV ? WRITE : READ;
 
+	SCSI_LOG_TIMEOUT(4, printk(KERN_INFO "sg_start_req: dxfer_len=%d\n",
+				   dxfer_len));
+
 	rq = blk_get_request(q, rw, GFP_ATOMIC);
 	if (!rq)
 		return -ENOMEM;
@@ -1648,27 +1657,6 @@ static int __sg_start_req(struct sg_request *srp, struct sg_io_hdr *hp,
 	rq->sense = srp->sense_b;
 	rq->retries = SG_DEFAULT_RETRIES;
 
-	return 0;
-}
-
-static int sg_start_req(Sg_request *srp, unsigned char *cmd)
-{
-	int res;
-	Sg_fd *sfp = srp->parentfp;
-	sg_io_hdr_t *hp = &srp->header;
-	int dxfer_len = (int) hp->dxfer_len;
-	int dxfer_dir = hp->dxfer_direction;
-	Sg_scatter_hold *req_schp = &srp->data;
-	Sg_scatter_hold *rsv_schp = &sfp->reserve;
-	struct request_queue *q = sfp->parentdp->device->request_queue;
-	struct rq_map_data map_data;
-
-	SCSI_LOG_TIMEOUT(4, printk("sg_start_req: dxfer_len=%d\n", dxfer_len));
-
-	res = __sg_start_req(srp, hp, cmd);
-	if (res)
-		return res;
-
 	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
 		return 0;
 

From 626710c9d665ff381c7ec666b6a023f064ca5fef Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 22:50:05 +0900
Subject: [PATCH 091/132] sg: incorporate sg_build_direct into sg_start_req

Calling blk_rq_map_user() in a single place is better than calling it
in two different places. It makes the code easier to understand.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 82 ++++++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 48 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 9a56c0d320bf..c0b6866eece9 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -202,7 +202,6 @@ static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id);
 static Sg_request *sg_add_request(Sg_fd * sfp);
 static int sg_remove_request(Sg_fd * sfp, Sg_request * srp);
 static int sg_res_in_use(Sg_fd * sfp);
-static int sg_build_direct(Sg_request * srp, Sg_fd * sfp, int dxfer_len);
 static Sg_device *sg_get_dev(int dev);
 #ifdef CONFIG_SCSI_PROC_FS
 static int sg_last_dev(void);
@@ -1628,16 +1627,17 @@ exit_sg(void)
 
 static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 {
-	int res = 0;
+	int res;
 	struct request *rq;
 	Sg_fd *sfp = srp->parentfp;
 	sg_io_hdr_t *hp = &srp->header;
 	int dxfer_len = (int) hp->dxfer_len;
 	int dxfer_dir = hp->dxfer_direction;
+	unsigned int iov_count = hp->iovec_count;
 	Sg_scatter_hold *req_schp = &srp->data;
 	Sg_scatter_hold *rsv_schp = &sfp->reserve;
 	struct request_queue *q = sfp->parentdp->device->request_queue;
-	struct rq_map_data map_data;
+	struct rq_map_data *md, map_data;
 	int rw = hp->dxfer_direction == SG_DXFER_TO_DEV ? WRITE : READ;
 
 	SCSI_LOG_TIMEOUT(4, printk(KERN_INFO "sg_start_req: dxfer_len=%d\n",
@@ -1660,38 +1660,43 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd)
 	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
 		return 0;
 
-	if (sg_allow_dio && (hp->flags & SG_FLAG_DIRECT_IO) &&
-	    (dxfer_dir != SG_DXFER_UNKNOWN) && (0 == hp->iovec_count) &&
-	    (!sfp->parentdp->device->host->unchecked_isa_dma) &&
+	if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO &&
+	    dxfer_dir != SG_DXFER_UNKNOWN && !iov_count &&
+	    !sfp->parentdp->device->host->unchecked_isa_dma &&
 	    blk_rq_aligned(q, hp->dxferp, dxfer_len))
-		return sg_build_direct(srp, sfp, dxfer_len);
-
-	if ((!sg_res_in_use(sfp)) && (dxfer_len <= rsv_schp->bufflen))
-		sg_link_reserve(sfp, srp, dxfer_len);
+		md = NULL;
 	else
-		res = sg_build_indirect(req_schp, sfp, dxfer_len);
+		md = &map_data;
 
-	if (!res) {
-		struct request *rq = srp->rq;
-		Sg_scatter_hold *schp = &srp->data;
-		int iovec_count = (int) hp->iovec_count;
+	if (md) {
+		if (!sg_res_in_use(sfp) && dxfer_len <= rsv_schp->bufflen)
+			sg_link_reserve(sfp, srp, dxfer_len);
+		else {
+			res = sg_build_indirect(req_schp, sfp, dxfer_len);
+			if (res)
+				return res;
+		}
 
-		map_data.pages = schp->pages;
-		map_data.page_order = schp->page_order;
-		map_data.nr_entries = schp->k_use_sg;
-
-		if (iovec_count)
-			res = blk_rq_map_user_iov(q, rq, &map_data, hp->dxferp,
-						  iovec_count,
-						  hp->dxfer_len, GFP_ATOMIC);
-		else
-			res = blk_rq_map_user(q, rq, &map_data, hp->dxferp,
-					      hp->dxfer_len, GFP_ATOMIC);
-
-		if (!res)
-			srp->bio = rq->bio;
+		md->pages = req_schp->pages;
+		md->page_order = req_schp->page_order;
+		md->nr_entries = req_schp->k_use_sg;
 	}
 
+	if (iov_count)
+		res = blk_rq_map_user_iov(q, rq, md, hp->dxferp, iov_count,
+					  hp->dxfer_len, GFP_ATOMIC);
+	else
+		res = blk_rq_map_user(q, rq, md, hp->dxferp,
+				      hp->dxfer_len, GFP_ATOMIC);
+
+	if (!res) {
+		srp->bio = rq->bio;
+
+		if (!md) {
+			req_schp->dio_in_use = 1;
+			hp->info |= SG_INFO_DIRECT_IO;
+		}
+	}
 	return res;
 }
 
@@ -1730,25 +1735,6 @@ sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp, int tablesize)
 	return tablesize;	/* number of scat_gath elements allocated */
 }
 
-/* Returns: -ve -> error, 0 -> done, 1 -> try indirect */
-static int
-sg_build_direct(Sg_request * srp, Sg_fd * sfp, int dxfer_len)
-{
-	sg_io_hdr_t *hp = &srp->header;
-	Sg_scatter_hold *schp = &srp->data;
-	int res;
-	struct request *rq = srp->rq;
-	struct request_queue *q = sfp->parentdp->device->request_queue;
-
-	res = blk_rq_map_user(q, rq, NULL, hp->dxferp, dxfer_len, GFP_ATOMIC);
-	if (res)
-		return res;
-	srp->bio = rq->bio;
-	schp->dio_in_use = 1;
-	hp->info |= SG_INFO_DIRECT_IO;
-	return 0;
-}
-
 static int
 sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size)
 {

From c3919af2354fff673026dcbeac6f009d2ce5ceee Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 22:50:06 +0900
Subject: [PATCH 092/132] sg: remove sg_write_xfer

sg_write_xfer was used to copy data from user space for WRITE
commands. blk_rq_map_user_iov and blk_rq_map_user do the job so
sg_write_xfer does nothing useful.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 32 --------------------------------
 1 file changed, 32 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index c0b6866eece9..07bd68331303 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -188,7 +188,6 @@ static ssize_t sg_new_write(Sg_fd *sfp, struct file *file,
 			int read_only, Sg_request **o_srp);
 static int sg_common_write(Sg_fd * sfp, Sg_request * srp,
 			   unsigned char *cmnd, int timeout, int blocking);
-static int sg_write_xfer(Sg_request * srp);
 static int sg_read_xfer(Sg_request * srp);
 static int sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer);
 static void sg_remove_scat(Sg_scatter_hold * schp);
@@ -736,11 +735,6 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp,
 		sg_finish_rem_req(srp);
 		return k;	/* probably out of space --> ENOMEM */
 	}
-	if ((k = sg_write_xfer(srp))) {
-		SCSI_LOG_TIMEOUT(1, printk("sg_common_write: write_xfer, bad address\n"));
-		sg_finish_rem_req(srp);
-		return k;
-	}
 	if (sdp->detached) {
 		sg_finish_rem_req(srp);
 		return -ENODEV;
@@ -1816,32 +1810,6 @@ out:
 	return -ENOMEM;
 }
 
-static int
-sg_write_xfer(Sg_request * srp)
-{
-	sg_io_hdr_t *hp = &srp->header;
-	Sg_scatter_hold *schp = &srp->data;
-	int num_xfer = 0;
-	int dxfer_dir = hp->dxfer_direction;
-	int new_interface = ('\0' == hp->interface_id) ? 0 : 1;
-
-	if ((SG_DXFER_UNKNOWN == dxfer_dir) || (SG_DXFER_TO_DEV == dxfer_dir) ||
-	    (SG_DXFER_TO_FROM_DEV == dxfer_dir)) {
-		num_xfer = (int) (new_interface ? hp->dxfer_len : hp->flags);
-		if (schp->bufflen < num_xfer)
-			num_xfer = schp->bufflen;
-	}
-	if ((num_xfer <= 0) || (schp->dio_in_use) ||
-	    (new_interface
-	     && ((SG_FLAG_NO_DXFER | SG_FLAG_MMAP_IO) & hp->flags)))
-		return 0;
-
-	SCSI_LOG_TIMEOUT(4, printk("sg_write_xfer: num_xfer=%d, k_use_sg=%d\n",
-			  num_xfer, schp->k_use_sg));
-
-	return 0;
-}
-
 static void
 sg_remove_scat(Sg_scatter_hold * schp)
 {

From 0b6cb26c6686f1f24607c41f0a6d21ce54191710 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 22:50:07 +0900
Subject: [PATCH 093/132] sg: remove sg_read_xfer

sg_read_xfer was used to copy data to user space for READ
commands. blk_rq_unmap_user does the job so sg_read_xfer does nothing
useful.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 33 +++++----------------------------
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 07bd68331303..df8bf67b171a 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -188,7 +188,6 @@ static ssize_t sg_new_write(Sg_fd *sfp, struct file *file,
 			int read_only, Sg_request **o_srp);
 static int sg_common_write(Sg_fd * sfp, Sg_request * srp,
 			   unsigned char *cmnd, int timeout, int blocking);
-static int sg_read_xfer(Sg_request * srp);
 static int sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer);
 static void sg_remove_scat(Sg_scatter_hold * schp);
 static void sg_build_reserve(Sg_fd * sfp, int req_size);
@@ -523,8 +522,11 @@ sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp)
 		err = -EFAULT;
 		goto err_out;
 	}
-	err = sg_read_xfer(srp);
-      err_out:
+	if (srp->bio) {
+		err = blk_rq_unmap_user(srp->bio);
+		srp->bio = NULL;
+	}
+err_out:
 	sg_finish_rem_req(srp);
 	return (0 == err) ? count : err;
 }
@@ -1831,31 +1833,6 @@ sg_remove_scat(Sg_scatter_hold * schp)
 	memset(schp, 0, sizeof (*schp));
 }
 
-static int
-sg_read_xfer(Sg_request * srp)
-{
-	sg_io_hdr_t *hp = &srp->header;
-	Sg_scatter_hold *schp = &srp->data;
-	int num_xfer = 0;
-	int dxfer_dir = hp->dxfer_direction;
-	int new_interface = ('\0' == hp->interface_id) ? 0 : 1;
-
-	if ((SG_DXFER_UNKNOWN == dxfer_dir) || (SG_DXFER_FROM_DEV == dxfer_dir)
-	    || (SG_DXFER_TO_FROM_DEV == dxfer_dir)) {
-		num_xfer = hp->dxfer_len;
-		if (schp->bufflen < num_xfer)
-			num_xfer = schp->bufflen;
-	}
-	if ((num_xfer <= 0) || (schp->dio_in_use) ||
-	    (new_interface
-	     && ((SG_FLAG_NO_DXFER | SG_FLAG_MMAP_IO) & hp->flags)))
-		return 0;
-
-	SCSI_LOG_TIMEOUT(4, printk("sg_read_xfer: num_xfer=%d, iovec_count=%d, k_use_sg=%d\n",
-			  num_xfer, (int)hp->iovec_count, schp->k_use_sg));
-	return 0;
-}
-
 static int
 sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer)
 {

From 4677735f03f5b6b6f2182f457a921855cadfb85b Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 2 Sep 2008 22:50:08 +0900
Subject: [PATCH 094/132] sg: remove unnecessary blk_rq_unmap_user

blk_rq_unmap_user in sg_finish_rem_req can take care of all the cases.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/sg.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index df8bf67b171a..ba9b9bbd4e73 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -522,10 +522,6 @@ sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp)
 		err = -EFAULT;
 		goto err_out;
 	}
-	if (srp->bio) {
-		err = blk_rq_unmap_user(srp->bio);
-		srp->bio = NULL;
-	}
 err_out:
 	sg_finish_rem_req(srp);
 	return (0 == err) ? count : err;
@@ -1844,9 +1840,6 @@ sg_read_oxfer(Sg_request * srp, char __user *outp, int num_read_xfer)
 	if ((!outp) || (num_read_xfer <= 0))
 		return 0;
 
-	blk_rq_unmap_user(srp->bio);
-	srp->bio = NULL;
-
 	num = 1 << (PAGE_SHIFT + schp->page_order);
 	for (k = 0; k < schp->k_use_sg && schp->pages[k]; k++) {
 		if (num > num_read_xfer) {

From 3e6053d76dcbd92b2f9f4ad5ece9bce83149523e Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Thu, 11 Sep 2008 10:57:55 +0200
Subject: [PATCH 095/132] block: adjust blkdev_issue_discard for swap

Two mods to blkdev_issue_discard(), thinking ahead to its use on swap:

1. Add gfp_mask argument, so swap allocation can use it where GFP_KERNEL
   might deadlock but GFP_NOIO is safe.

2. Enlarge nr_sects argument from unsigned to sector_t: unsigned long is
   enough to cover a whole swap area, but sector_t suits any partition.

Change sb_issue_discard()'s nr_blocks to sector_t too; but no need seen
for a gfp_mask there, just pass GFP_KERNEL down to blkdev_issue_discard().

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 7 ++++---
 include/linux/blkdev.h | 9 +++++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 988b63479b2f..5c99ff8d2db8 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -332,12 +332,13 @@ static void blkdev_discard_end_io(struct bio *bio, int err)
  * @bdev:	blockdev to issue discard for
  * @sector:	start sector
  * @nr_sects:	number of sectors to discard
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
  *
  * Description:
  *    Issue a discard request for the sectors in question. Does not wait.
  */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-			 unsigned nr_sects)
+int blkdev_issue_discard(struct block_device *bdev,
+			 sector_t sector, sector_t nr_sects, gfp_t gfp_mask)
 {
 	struct request_queue *q;
 	struct bio *bio;
@@ -354,7 +355,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		return -EOPNOTSUPP;
 
 	while (nr_sects && !ret) {
-		bio = bio_alloc(GFP_KERNEL, 0);
+		bio = bio_alloc(gfp_mask, 0);
 		if (!bio)
 			return -ENOMEM;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 37781d6fe045..b47767c72ce3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -16,6 +16,7 @@
 #include <linux/bio.h>
 #include <linux/module.h>
 #include <linux/stringify.h>
+#include <linux/gfp.h>
 #include <linux/bsg.h>
 #include <linux/smp.h>
 
@@ -873,15 +874,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
-extern int blkdev_issue_discard(struct block_device *, sector_t sector,
-				unsigned nr_sects);
+extern int blkdev_issue_discard(struct block_device *,
+				sector_t sector, sector_t nr_sects, gfp_t);
 
 static inline int sb_issue_discard(struct super_block *sb,
-				   sector_t block, unsigned nr_blocks)
+				   sector_t block, sector_t nr_blocks)
 {
 	block <<= (sb->s_blocksize_bits - 9);
 	nr_blocks <<= (sb->s_blocksize_bits - 9);
-	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks);
+	return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL);
 }
 
 /*

From 0a0d96b03a1f3bfd6bc3ea08008699e8e59fccd9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 11 Sep 2008 13:17:37 +0200
Subject: [PATCH 096/132] block: add bio_kmalloc()

Not all callers need (or want!) the mempool backing guarantee, it
essentially means that you can only use bio_alloc() for short allocations
and not for preallocating some bio's at setup or init time.

So add bio_kmalloc() which does the same thing as bio_alloc(), except
it just uses kmalloc() as the backing instead of the bio mempools.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c            | 96 +++++++++++++++++++++++++++++++++++----------
 include/linux/bio.h |  1 +
 2 files changed, 76 insertions(+), 21 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index 355302985e22..e56e7685af9c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
 	struct bio_vec *bvl;
 
 	/*
-	 * see comment near bvec_array define!
+	 * If 'bs' is given, lookup the pool and do the mempool alloc.
+	 * If not, this is a bio_kmalloc() allocation and just do a
+	 * kzalloc() for the exact number of vecs right away.
 	 */
-	switch (nr) {
-		case   1        : *idx = 0; break;
-		case   2 ...   4: *idx = 1; break;
-		case   5 ...  16: *idx = 2; break;
-		case  17 ...  64: *idx = 3; break;
-		case  65 ... 128: *idx = 4; break;
-		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
+	if (bs) {
+		/*
+		 * see comment near bvec_array define!
+		 */
+		switch (nr) {
+		case 1:
+			*idx = 0;
+			break;
+		case 2 ... 4:
+			*idx = 1;
+			break;
+		case 5 ... 16:
+			*idx = 2;
+			break;
+		case 17 ... 64:
+			*idx = 3;
+			break;
+		case 65 ... 128:
+			*idx = 4;
+			break;
+		case 129 ... BIO_MAX_PAGES:
+			*idx = 5;
+			break;
 		default:
 			return NULL;
-	}
-	/*
-	 * idx now points to the pool we want to allocate from
-	 */
+		}
 
-	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
-	if (bvl)
-		memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+		/*
+		 * idx now points to the pool we want to allocate from
+		 */
+		bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
+		if (bvl)
+			memset(bvl, 0,
+				bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+	} else
+		bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
 
 	return bvl;
 }
@@ -107,6 +128,12 @@ static void bio_fs_destructor(struct bio *bio)
 	bio_free(bio, fs_bio_set);
 }
 
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+	kfree(bio->bi_io_vec);
+	kfree(bio);
+}
+
 void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
@@ -119,19 +146,25 @@ void bio_init(struct bio *bio)
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
  * @nr_iovecs:	number of iovecs to pre-allocate
- * @bs:		the bio_set to allocate from
+ * @bs:		the bio_set to allocate from. If %NULL, just use kmalloc
  *
  * Description:
- *   bio_alloc_bioset will first try it's on mempool to satisfy the allocation.
+ *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
  *   If %__GFP_WAIT is set then we will block on the internal pool waiting
- *   for a &struct bio to become free.
+ *   for a &struct bio to become free. If a %NULL @bs is passed in, we will
+ *   fall back to just using @kmalloc to allocate the required memory.
  *
  *   allocate bio and iovecs from the memory pools specified by the
- *   bio_set structure.
+ *   bio_set structure, or @kmalloc if none given.
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
-	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	struct bio *bio;
+
+	if (bs)
+		bio = mempool_alloc(bs->bio_pool, gfp_mask);
+	else
+		bio = kmalloc(sizeof(*bio), gfp_mask);
 
 	if (likely(bio)) {
 		struct bio_vec *bvl = NULL;
@@ -142,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
 			if (unlikely(!bvl)) {
-				mempool_free(bio, bs->bio_pool);
+				if (bs)
+					mempool_free(bio, bs->bio_pool);
+				else
+					kfree(bio);
 				bio = NULL;
 				goto out;
 			}
@@ -165,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 	return bio;
 }
 
+/*
+ * Like bio_alloc(), but doesn't use a mempool backing. This means that
+ * it CAN fail, but while bio_alloc() can only be used for allocations
+ * that have a short (finite) life span, bio_kmalloc() should be used
+ * for more permanent bio allocations (like allocating some bio's for
+ * initalization or setup purposes).
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+{
+	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+
+	if (bio)
+		bio->bi_destructor = bio_kmalloc_destructor;
+
+	return bio;
+}
+
 void zero_fill_bio(struct bio *bio)
 {
 	unsigned long flags;
@@ -1349,6 +1402,7 @@ static int __init init_bio(void)
 subsys_initcall(init_bio);
 
 EXPORT_SYMBOL(bio_alloc);
+EXPORT_SYMBOL(bio_kmalloc);
 EXPORT_SYMBOL(bio_put);
 EXPORT_SYMBOL(bio_free);
 EXPORT_SYMBOL(bio_endio);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7af373f253dc..6520ee1a3f6d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -308,6 +308,7 @@ extern struct bio_set *bioset_create(int, int);
 extern void bioset_free(struct bio_set *);
 
 extern struct bio *bio_alloc(gfp_t, int);
+extern struct bio *bio_kmalloc(gfp_t, int);
 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
 extern void bio_free(struct bio *, struct bio_set *);

From 581d4e28d9195aa8b2231383dbabc288988d615e Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sun, 14 Sep 2008 05:56:33 -0700
Subject: [PATCH 097/132] block: add fault injection mechanism for faking
 request timeouts

Only works for the generic request timer handling. Allows one to
sporadically ignore request completions, thus exercising the timeout
handling.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-softirq.c    |  2 ++
 block/blk-timeout.c    | 59 ++++++++++++++++++++++++++++++++++++++++++
 block/blk.h            | 12 +++++++++
 block/genhd.c          |  8 ++++++
 include/linux/blkdev.h |  1 +
 lib/Kconfig.debug      | 13 +++++++++-
 6 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 7ab344afb16f..e660d26ca656 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -154,6 +154,8 @@ do_local:
  **/
 void blk_complete_request(struct request *req)
 {
+	if (unlikely(blk_should_fake_timeout(req->q)))
+		return;
 	if (!blk_mark_rq_complete(req))
 		__blk_complete_request(req);
 }
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 6e5c781c5af1..9b4ad138bb33 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -4,9 +4,68 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/fault-inject.h>
 
 #include "blk.h"
 
+#ifdef CONFIG_FAIL_IO_TIMEOUT
+
+static DECLARE_FAULT_ATTR(fail_io_timeout);
+
+static int __init setup_fail_io_timeout(char *str)
+{
+	return setup_fault_attr(&fail_io_timeout, str);
+}
+__setup("fail_io_timeout=", setup_fail_io_timeout);
+
+int blk_should_fake_timeout(struct request_queue *q)
+{
+	if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
+		return 0;
+
+	return should_fail(&fail_io_timeout, 1);
+}
+
+static int __init fail_io_timeout_debugfs(void)
+{
+	return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout");
+}
+
+late_initcall(fail_io_timeout_debugfs);
+
+ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags);
+
+	return sprintf(buf, "%d\n", set != 0);
+}
+
+ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	int val;
+
+	if (count) {
+		struct request_queue *q = disk->queue;
+		char *p = (char *) buf;
+
+		val = simple_strtoul(p, &p, 10);
+		spin_lock_irq(q->queue_lock);
+		if (val)
+			queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
+		else
+			queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
+		spin_unlock_irq(q->queue_lock);
+	}
+
+	return count;
+}
+
+#endif /* CONFIG_FAIL_IO_TIMEOUT */
+
 /*
  * blk_delete_timer - Delete/cancel timer for a given function.
  * @req:	request that we are canceling timer for
diff --git a/block/blk.h b/block/blk.h
index a4f4a50aefaa..e5c579769963 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -42,6 +42,18 @@ static inline void blk_clear_rq_complete(struct request *rq)
 	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 }
 
+#ifdef CONFIG_FAIL_IO_TIMEOUT
+int blk_should_fake_timeout(struct request_queue *);
+ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
+ssize_t part_timeout_store(struct device *, struct device_attribute *,
+				const char *, size_t);
+#else
+static inline int blk_should_fake_timeout(struct request_queue *q)
+{
+	return 0;
+}
+#endif
+
 struct io_context *current_io_context(gfp_t gfp_flags, int node);
 
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
diff --git a/block/genhd.c b/block/genhd.c
index 8acaff0154e3..4cd3433c99ac 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -817,6 +817,11 @@ static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
 #endif
+#ifdef CONFIG_FAIL_IO_TIMEOUT
+static struct device_attribute dev_attr_fail_timeout =
+	__ATTR(io-timeout-fail,  S_IRUGO|S_IWUSR, part_timeout_show,
+		part_timeout_store);
+#endif
 
 static struct attribute *disk_attrs[] = {
 	&dev_attr_range.attr,
@@ -828,6 +833,9 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
+#endif
+#ifdef CONFIG_FAIL_IO_TIMEOUT
+	&dev_attr_fail_timeout.attr,
 #endif
 	NULL
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b47767c72ce3..e34999d14c16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -440,6 +440,7 @@ struct request_queue
 #define QUEUE_FLAG_BIDI		9	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES    10	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP   11	/* force complete on same CPU */
+#define QUEUE_FLAG_FAIL_IO     12	/* fake timeout */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c556896abe57..7d7a31d0ddeb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -683,10 +683,21 @@ config FAIL_PAGE_ALLOC
 
 config FAIL_MAKE_REQUEST
 	bool "Fault-injection capability for disk IO"
-	depends on FAULT_INJECTION
+	depends on FAULT_INJECTION && BLOCK
 	help
 	  Provide fault-injection capability for disk IO.
 
+config FAIL_IO_TIMEOUT
+	bool "Faul-injection capability for faking disk interrupts"
+	depends on FAULT_INJECTION && BLOCK
+	help
+	  Provide fault-injection capability on end IO handling. This
+	  will make the block layer "forget" an interrupt as configured,
+	  thus exercising the error handling.
+
+	  Only works with drivers that use the generic timeout handling,
+	  for others it wont do anything.
+
 config FAULT_INJECTION_DEBUG_FS
 	bool "Debugfs entries for fault-injection capabilities"
 	depends on FAULT_INJECTION && SYSFS && DEBUG_FS

From 7ba1fbaa4a478f72fbaf5a56af9c82a77966b4c7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 16 Sep 2008 09:54:11 -0700
Subject: [PATCH 098/132] block: use rq complete marking in blk_abort_request()

We cannot abort a request if we raced with the timeout handler already,
or with the IO completion. So make blk_abort_request() mark the request
as complete, and only continue if we succeeded.

Found and suggested by Mike Anderson <andmike@linux.vnet.ibm.com>

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-timeout.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 9b4ad138bb33..972a63f848fb 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -158,6 +158,8 @@ void blk_rq_timed_out_timer(unsigned long data)
  */
 void blk_abort_request(struct request *req)
 {
+	if (blk_mark_rq_complete(req))
+		return;
 	blk_delete_timer(req);
 	blk_rq_timed_out(req);
 }

From 9c02f2b02e29a2244e36c6e1f246080d8afc6cff Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 18 Sep 2008 09:31:53 -0700
Subject: [PATCH 099/132] block: cleanup some of the integrity stuff in
 blkdev.h

Don't put functions that are only used in fs/bio-integrity.c in
blkdev.h, it's much cleaner to just keep it in there. Also kill
completely unused bdev_get_tag_size()

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c     | 31 ++++++++++++++++++++++++++++++
 include/linux/blkdev.h | 43 ------------------------------------------
 2 files changed, 31 insertions(+), 43 deletions(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index c3e174b35fe6..ba4ada08564a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -150,6 +150,29 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+static struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
+{
+	return bdev->bd_disk->integrity;
+}
+
+static int bdev_integrity_enabled(struct block_device *bdev, int rw)
+{
+	struct blk_integrity *bi = bdev_get_integrity(bdev);
+
+	if (bi == NULL)
+		return 0;
+
+	if (rw == READ && bi->verify_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_READ))
+		return 1;
+
+	if (rw == WRITE && bi->generate_fn != NULL &&
+	    (bi->flags & INTEGRITY_FLAG_WRITE))
+		return 1;
+
+	return 0;
+}
+
 /**
  * bio_integrity_enabled - Check whether integrity can be passed
  * @bio:	bio to check
@@ -313,6 +336,14 @@ static void bio_integrity_generate(struct bio *bio)
 	}
 }
 
+static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
+{
+	if (bi)
+		return bi->tuple_size;
+
+	return 0;
+}
+
 /**
  * bio_integrity_prep - Prepare bio for integrity I/O
  * @bio:	bio to prepare
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e34999d14c16..e23b838825bd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1004,47 +1004,6 @@ extern int blk_integrity_compare(struct block_device *, struct block_device *);
 extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 extern int blk_rq_count_integrity_sg(struct request *);
 
-static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
-{
-	if (bi)
-		return bi->tuple_size;
-
-	return 0;
-}
-
-static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
-{
-	return bdev->bd_disk->integrity;
-}
-
-static inline unsigned int bdev_get_tag_size(struct block_device *bdev)
-{
-	struct blk_integrity *bi = bdev_get_integrity(bdev);
-
-	if (bi)
-		return bi->tag_size;
-
-	return 0;
-}
-
-static inline int bdev_integrity_enabled(struct block_device *bdev, int rw)
-{
-	struct blk_integrity *bi = bdev_get_integrity(bdev);
-
-	if (bi == NULL)
-		return 0;
-
-	if (rw == READ && bi->verify_fn != NULL &&
-	    (bi->flags & INTEGRITY_FLAG_READ))
-		return 1;
-
-	if (rw == WRITE && bi->generate_fn != NULL &&
-	    (bi->flags & INTEGRITY_FLAG_WRITE))
-		return 1;
-
-	return 0;
-}
-
 static inline int blk_integrity_rq(struct request *rq)
 {
 	if (rq->bio == NULL)
@@ -1058,8 +1017,6 @@ static inline int blk_integrity_rq(struct request *rq)
 #define blk_integrity_rq(rq)			(0)
 #define blk_rq_count_integrity_sg(a)		(0)
 #define blk_rq_map_integrity_sg(a, b)		(0)
-#define bdev_get_integrity(a)			(0)
-#define bdev_get_tag_size(a)			(0)
 #define blk_integrity_compare(a, b)		(0)
 #define blk_integrity_register(a, b)		(0)
 #define blk_integrity_unregister(a)		do { } while (0);

From 9246b5f06deeea541e7c62437c2ad19a0b1172c0 Mon Sep 17 00:00:00 2001
From: Chris Lalancette <clalance@redhat.com>
Date: Wed, 17 Sep 2008 14:30:32 -0700
Subject: [PATCH 100/132] block: Expand Xen blkfront for > 16 xvd

Until recently, the maximum number of xvd block devices you could attach
to a Xen domU was 16. This limitation turned out to be problematic for
some users, so it was expanded to handle a much larger number of disks.
However, this requires a couple of changes in the way that blkfront
scans for disks. This functionality is already present in the Xen
linux-2.6.18-xen.hg tree; the attached patch adds this functionality to
the mainline xen-blkfront implementation. I successfully tested it on a
2.6.25 tree, and build tested it on 2.6.27-rc3.

Signed-off-by: Chris Lalancette <clalance@redhat.com>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/xen-blkfront.c | 76 ++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 21 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 3ca643cafccd..bff602ccccf3 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -105,15 +105,17 @@ static DEFINE_SPINLOCK(blkif_io_lock);
 #define GRANT_INVALID_REF	0
 
 #define PARTS_PER_DISK		16
+#define PARTS_PER_EXT_DISK      256
 
 #define BLKIF_MAJOR(dev) ((dev)>>8)
 #define BLKIF_MINOR(dev) ((dev) & 0xff)
 
-#define DEV_NAME	"xvd"	/* name in /dev */
+#define EXT_SHIFT 28
+#define EXTENDED (1<<EXT_SHIFT)
+#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
+#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
 
-/* Information about our VBDs. */
-#define MAX_VBDS 64
-static LIST_HEAD(vbds_list);
+#define DEV_NAME	"xvd"	/* name in /dev */
 
 static int get_id_from_freelist(struct blkfront_info *info)
 {
@@ -386,31 +388,60 @@ static int xlvbd_barrier(struct blkfront_info *info)
 }
 
 
-static int xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity,
-			       int vdevice, u16 vdisk_info, u16 sector_size,
-			       struct blkfront_info *info)
+static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+			       struct blkfront_info *info,
+			       u16 vdisk_info, u16 sector_size)
 {
 	struct gendisk *gd;
 	int nr_minors = 1;
 	int err = -ENODEV;
+	unsigned int offset;
+	int minor;
+	int nr_parts;
 
 	BUG_ON(info->gd != NULL);
 	BUG_ON(info->rq != NULL);
 
-	if ((minor % PARTS_PER_DISK) == 0)
-		nr_minors = PARTS_PER_DISK;
+	if ((info->vdevice>>EXT_SHIFT) > 1) {
+		/* this is above the extended range; something is wrong */
+		printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
+		return -ENODEV;
+	}
+
+	if (!VDEV_IS_EXTENDED(info->vdevice)) {
+		minor = BLKIF_MINOR(info->vdevice);
+		nr_parts = PARTS_PER_DISK;
+	} else {
+		minor = BLKIF_MINOR_EXT(info->vdevice);
+		nr_parts = PARTS_PER_EXT_DISK;
+	}
+
+	if ((minor % nr_parts) == 0)
+		nr_minors = nr_parts;
 
 	gd = alloc_disk(nr_minors);
 	if (gd == NULL)
 		goto out;
 
-	if (nr_minors > 1)
-		sprintf(gd->disk_name, "%s%c", DEV_NAME,
-			'a' + minor / PARTS_PER_DISK);
-	else
-		sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
-			'a' + minor / PARTS_PER_DISK,
-			minor % PARTS_PER_DISK);
+	offset = minor / nr_parts;
+
+	if (nr_minors > 1) {
+		if (offset < 26)
+			sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
+		else
+			sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
+				'a' + ((offset / 26)-1), 'a' + (offset % 26));
+	} else {
+		if (offset < 26)
+			sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
+				'a' + offset,
+				minor & (nr_parts - 1));
+		else
+			sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
+				'a' + ((offset / 26) - 1),
+				'a' + (offset % 26),
+				minor & (nr_parts - 1));
+	}
 
 	gd->major = XENVBD_MAJOR;
 	gd->first_minor = minor;
@@ -699,8 +730,13 @@ static int blkfront_probe(struct xenbus_device *dev,
 	err = xenbus_scanf(XBT_NIL, dev->nodename,
 			   "virtual-device", "%i", &vdevice);
 	if (err != 1) {
-		xenbus_dev_fatal(dev, err, "reading virtual-device");
-		return err;
+		/* go looking in the extended area instead */
+		err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
+				   "%i", &vdevice);
+		if (err != 1) {
+			xenbus_dev_fatal(dev, err, "reading virtual-device");
+			return err;
+		}
 	}
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
@@ -861,9 +897,7 @@ static void blkfront_connect(struct blkfront_info *info)
 	if (err)
 		info->feature_barrier = 0;
 
-	err = xlvbd_alloc_gendisk(BLKIF_MINOR(info->vdevice),
-				  sectors, info->vdevice,
-				  binfo, sector_size, info);
+	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
 				 info->xbdev->otherend);

From e3335de94067dbebe22e3962632ead34e832cb60 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 18 Sep 2008 09:22:54 -0700
Subject: [PATCH 101/132] block: blk_cleanup_queue() should call
 blk_sync_queue()

When a driver calls blk_cleanup_queue(), the device should be fully idle.
However, the block layer may have pending plugging timers and the IO
schedulers may have pending work in the work queues. So quiesce the device
by waiting for the timer and flushing the work queues.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index d768a8ddc173..37fba001bdcf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -436,6 +436,14 @@ void blk_put_queue(struct request_queue *q)
 
 void blk_cleanup_queue(struct request_queue *q)
 {
+	/*
+	 * We know we have process context here, so we can be a little
+	 * cautious and ensure that pending block actions on this device
+	 * are done before moving on. Going into this function, we should
+	 * not have processes doing IO to this device.
+	 */
+	blk_sync_queue(q);
+
 	mutex_lock(&q->sysfs_lock);
 	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
 	mutex_unlock(&q->sysfs_lock);

From 32fab448e5e86694beade415e750363538ea5f49 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 18 Sep 2008 10:45:09 -0400
Subject: [PATCH 102/132] block: add request update interface

This patch adds blk_update_request(), which updates a struct request
by completing its data part, but doesn't complete the struct
request itself.
Though it looks like end_that_request_first() of older kernels,
blk_update_request() should be used only by request stacking drivers.

Request-based dm will use it in bio->bi_end_io callback to update
the original request when a data part of a cloned request completes.
Followings are additional background information of why request-based
dm needs this interface.

  - Request stacking drivers can't use blk_end_request() directly from
    the lower driver's completion context (bio->bi_end_io or rq->end_io),
    because some device drivers (e.g. ide) may try to complete
    their request with queue lock held, and it may cause deadlock.
    See below for detailed description of possible deadlock:
    <http://marc.info/?l=linux-kernel&m=120311479108569&w=2>

  - To solve that, request-based dm offloads the completion of
    cloned struct request to softirq context (i.e. using
    blk_complete_request() from rq->end_io).

  - Though it is possible to use the same solution from bio->bi_end_io,
    it will delay the notification of bio completion to the original
    submitter.  Also, it will cause inefficient partial completion,
    because the lower driver can't perform the cloned request anymore
    and request-based dm needs to requeue and redispatch it to
    the lower driver again later.  That's not good.

  - So request-based dm needs blk_update_request() to perform the bio
    completion in the lower driver's completion context, which is more
    efficient.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 57 +++++++++++++++++++++++++++++++++++-------
 include/linux/blkdev.h |  2 ++
 2 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 37fba001bdcf..527d43e982bb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1806,6 +1806,22 @@ void end_request(struct request *req, int uptodate)
 }
 EXPORT_SYMBOL(end_request);
 
+static int end_that_request_data(struct request *rq, int error,
+				 unsigned int nr_bytes, unsigned int bidi_bytes)
+{
+	if (rq->bio) {
+		if (__end_that_request_first(rq, error, nr_bytes))
+			return 1;
+
+		/* Bidi request must be completed as a whole */
+		if (blk_bidi_rq(rq) &&
+		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
+			return 1;
+	}
+
+	return 0;
+}
+
 /**
  * blk_end_io - Generic end_io function to complete a request.
  * @rq:           the request being processed
@@ -1832,15 +1848,8 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
 
-	if (rq->bio) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
-
-		/* Bidi request must be completed as a whole */
-		if (blk_bidi_rq(rq) &&
-		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
-			return 1;
-	}
+	if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
+		return 1;
 
 	/* Special feature for tricky drivers */
 	if (drv_callback && drv_callback(rq))
@@ -1922,6 +1931,36 @@ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
 }
 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
+/**
+ * blk_update_request - Special helper function for request stacking drivers
+ * @rq:           the request being processed
+ * @error:        %0 for success, < %0 for error
+ * @nr_bytes:     number of bytes to complete @rq
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq, but doesn't complete
+ *     the request structure even if @rq doesn't have leftover.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ *     This special helper function is only for request stacking drivers
+ *     (e.g. request-based dm) so that they can handle partial completion.
+ *     Actual device drivers should use blk_end_request instead.
+ */
+void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
+{
+	if (!end_that_request_data(rq, error, nr_bytes, 0)) {
+		/*
+		 * These members are not updated in end_that_request_data()
+		 * when all bios are completed.
+		 * Update them so that the request stacking driver can find
+		 * how many bytes remain in the request later.
+		 */
+		rq->nr_sectors = rq->hard_nr_sectors = 0;
+		rq->current_nr_sectors = rq->hard_cur_sectors = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(blk_update_request);
+
 /**
  * blk_end_request_callback - Special helper function for tricky drivers
  * @rq:           the request being processed
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e23b838825bd..e82a84c9f37a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -791,6 +791,8 @@ extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
 extern void blk_abort_queue(struct request_queue *);
+extern void blk_update_request(struct request *rq, int error,
+			       unsigned int nr_bytes);
 
 /*
  * blk_end_request() takes bytes instead of sectors as a complete size.

From 82124d60354846623a4b94af335717a5e142a074 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 18 Sep 2008 10:45:38 -0400
Subject: [PATCH 103/132] block: add request submission interface

This patch adds blk_insert_cloned_request(), a generic request
submission interface for request stacking drivers.
Request-based dm will use it to submit their clones to underlying
devices.

blk_rq_check_limits() is also added because it is possible that
the lower queue has stronger limitations than the upper queue
if multiple drivers are stacking at request-level.
Not only for blk_insert_cloned_request()'s internal use, the function
will be used by request-based dm when the queue limitation is
modified (e.g. by replacing dm's table).

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 81 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  3 ++
 2 files changed, 84 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 527d43e982bb..b8ffbfe85ca4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1529,6 +1529,87 @@ void submit_bio(int rw, struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio);
 
+/**
+ * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * @q:  the queue
+ * @rq: the request being checked
+ *
+ * Description:
+ *    @rq may have been made based on weaker limitations of upper-level queues
+ *    in request stacking drivers, and it may violate the limitation of @q.
+ *    Since the block layer and the underlying device driver trust @rq
+ *    after it is inserted to @q, it should be checked against @q before
+ *    the insertion using this generic function.
+ *
+ *    This function should also be useful for request stacking drivers
+ *    in some cases below, so export this function.
+ *    Request stacking drivers like request-based dm may change the queue
+ *    limits while requests are in the queue (e.g. dm's table swapping).
+ *    Such request stacking drivers should check those requests against
+ *    the new queue limits again when they dispatch those requests,
+ *    although such checks are also done against the old queue limits
+ *    when submitting requests.
+ */
+int blk_rq_check_limits(struct request_queue *q, struct request *rq)
+{
+	if (rq->nr_sectors > q->max_sectors ||
+	    rq->data_len > q->max_hw_sectors << 9) {
+		printk(KERN_ERR "%s: over max size limit.\n", __func__);
+		return -EIO;
+	}
+
+	/*
+	 * queue's settings related to segment counting like q->bounce_pfn
+	 * may differ from that of other stacking queues.
+	 * Recalculate it to check the request correctly on this queue's
+	 * limitation.
+	 */
+	blk_recalc_rq_segments(rq);
+	if (rq->nr_phys_segments > q->max_phys_segments ||
+	    rq->nr_phys_segments > q->max_hw_segments) {
+		printk(KERN_ERR "%s: over max segments limit.\n", __func__);
+		return -EIO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_rq_check_limits);
+
+/**
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
+ * @q:  the queue to submit the request
+ * @rq: the request being queued
+ */
+int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
+{
+	unsigned long flags;
+
+	if (blk_rq_check_limits(q, rq))
+		return -EIO;
+
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+	if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
+	    should_fail(&fail_make_request, blk_rq_bytes(rq)))
+		return -EIO;
+#endif
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	/*
+	 * Submitting request must be dequeued before calling this function
+	 * because it will be linked to another request_queue
+	 */
+	BUG_ON(blk_queued_rq(rq));
+
+	drive_stat_acct(rq, 1);
+	__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
+
 /**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e82a84c9f37a..964c246bc271 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -693,6 +693,9 @@ extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
+extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
+extern int blk_insert_cloned_request(struct request_queue *q,
+				     struct request *rq);
 extern void blk_plug_device(struct request_queue *);
 extern void blk_plug_device_unlocked(struct request_queue *);
 extern int blk_remove_plug(struct request_queue *);

From 4ee5eaf4516a60f8ef64d3c246c64c6be0cf8c3a Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 18 Sep 2008 10:46:13 -0400
Subject: [PATCH 104/132] block: add a queue flag for request stacking support

This patch adds a queue flag to indicate the block device can be
used for request stacking.

Request stacking drivers need to stack their devices only on top of
devices whose q->request_fn is functional.
Since bio stacking drivers (e.g. md, loop) basically initialize
their queue using blk_alloc_queue() and don't set q->request_fn,
the check of (q->request_fn == NULL) looks sufficient for that purpose.

However, dm will become both types of stacking driver (bio-based and
request-based), and dm will always set q->request_fn even if the dm
device is bio-based, in which case q->request_fn is not actually functional.
So we need something else to distinguish the type of the device.
Adding a queue flag is a solution for that.

The reason why dm always sets q->request_fn is to keep
the compatibility of dm user-space tools.
Currently, all dm user-space tools are using bio-based dm without
specifying the type of the dm device they use.
To use request-based dm without changing such tools, the kernel
must decide the type of the dm device automatically.
The automatic type decision can't be done at the device creation time
and needs to be deferred until such tools load a mapping table,
since the actual type is decided by dm target type included in
the mapping table.

So a dm device has to be initialized using blk_init_queue()
so that we can load either type of table.
Then, all queue fields are set (e.g. q->request_fn) and nothing in
the queue distinguishes whether it is bio-based or request-based,
even after a table is loaded and the type of the device is decided.

By the way, some parts of the queue (e.g. request_list, elevator)
are unnecessary when the dm device is used as bio-based.
But the memory size is not so large (about 20 KB per queue on ia64),
so I hope the memory overhead is acceptable for bio-based dm users.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 3 ++-
 include/linux/blkdev.h | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b8ffbfe85ca4..fa212348c4c9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -574,7 +574,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	q->request_fn		= rfn;
 	q->prep_rq_fn		= NULL;
 	q->unplug_fn		= generic_unplug_device;
-	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER);
+	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER |
+				   1 << QUEUE_FLAG_STACKABLE);
 	q->queue_lock		= lock;
 
 	blk_queue_segment_boundary(q, 0xffffffff);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 964c246bc271..86f77ef127f4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -441,6 +441,7 @@ struct request_queue
 #define QUEUE_FLAG_NOMERGES    10	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP   11	/* force complete on same CPU */
 #define QUEUE_FLAG_FAIL_IO     12	/* fake timeout */
+#define QUEUE_FLAG_STACKABLE   13	/* supports request stacking */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
@@ -547,6 +548,8 @@ enum {
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_flushing(q)	((q)->ordseq)
+#define blk_queue_stackable(q)	\
+	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 
 #define blk_fs_request(rq)	((rq)->cmd_type == REQ_TYPE_FS)
 #define blk_pc_request(rq)	((rq)->cmd_type == REQ_TYPE_BLOCK_PC)

From 79eb014578b79fcfb9d9e7dc979d1316079220aa Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 18 Sep 2008 09:35:28 -0700
Subject: [PATCH 105/132] fix an example of scatterlists handling in
 DMA-API.txt

This example isn't the proper way to handle scatterlists (can't handle
sg chaining).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/DMA-API.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index d8b63d164e41..b8e86460046e 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -337,7 +337,7 @@ With scatterlists, you use the resulting mapping like this:
 	int i, count = dma_map_sg(dev, sglist, nents, direction);
 	struct scatterlist *sg;
 
-	for (i = 0, sg = sglist; i < count; i++, sg++) {
+	for_each_sg(sglist, sg, count, i) {
 		hw_address[i] = sg_dma_address(sg);
 		hw_len[i] = sg_dma_len(sg);
 	}

From 905bd78f2188da69e74966918e3d71df3dff382b Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Fri, 19 Sep 2008 18:27:47 -0700
Subject: [PATCH 106/132] cciss: Fix cciss SCSI rescan code to better notice
 device changes

Fix cciss SCSI rescan code to better notice device changes.
If you hot-unplug a tape drive, then hot-plug a different
tape drive into the same slot in a storage enclosure,
the cciss driver wouldn't notice anything had changed, as
it was only looking at the LUN address and device type.
Now it looks at the inquiry page 0x83 device identifier,
and vendor and model strings as well.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss_scsi.c | 149 ++++++++++++++++++++++++-------------
 drivers/block/cciss_scsi.h |   4 +
 2 files changed, 101 insertions(+), 52 deletions(-)

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index e1233aabda77..a3fd87b41444 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -365,7 +365,7 @@ struct scsi2map {
 
 static int 
 cciss_scsi_add_entry(int ctlr, int hostno, 
-		unsigned char *scsi3addr, int devtype,
+		struct cciss_scsi_dev_t *device,
 		struct scsi2map *added, int *nadded)
 {
 	/* assumes hba[ctlr]->scsi_ctlr->lock is held */ 
@@ -384,12 +384,12 @@ cciss_scsi_add_entry(int ctlr, int hostno,
 	lun = 0;
 	/* Is this device a non-zero lun of a multi-lun device */
 	/* byte 4 of the 8-byte LUN addr will contain the logical unit no. */
-	if (scsi3addr[4] != 0) {
+	if (device->scsi3addr[4] != 0) {
 		/* Search through our list and find the device which */
 		/* has the same 8 byte LUN address, excepting byte 4. */
 		/* Assign the same bus and target for this new LUN. */
 		/* Use the logical unit number from the firmware. */
-		memcpy(addr1, scsi3addr, 8);
+		memcpy(addr1, device->scsi3addr, 8);
 		addr1[4] = 0;
 		for (i = 0; i < n; i++) {
 			sd = &ccissscsi[ctlr].dev[i];
@@ -399,7 +399,7 @@ cciss_scsi_add_entry(int ctlr, int hostno,
 			if (memcmp(addr1, addr2, 8) == 0) {
 				bus = sd->bus;
 				target = sd->target;
-				lun = scsi3addr[4];
+				lun = device->scsi3addr[4];
 				break;
 			}
 		}
@@ -420,8 +420,12 @@ cciss_scsi_add_entry(int ctlr, int hostno,
 	added[*nadded].lun = sd->lun;
 	(*nadded)++;
 
-	memcpy(&sd->scsi3addr[0], scsi3addr, 8);
-	sd->devtype = devtype;
+	memcpy(sd->scsi3addr, device->scsi3addr, 8);
+	memcpy(sd->vendor, device->vendor, sizeof(sd->vendor));
+	memcpy(sd->revision, device->revision, sizeof(sd->revision));
+	memcpy(sd->device_id, device->device_id, sizeof(sd->device_id));
+	sd->devtype = device->devtype;
+
 	ccissscsi[ctlr].ndevices++;
 
 	/* initially, (before registering with scsi layer) we don't 
@@ -487,6 +491,22 @@ static void fixup_botched_add(int ctlr, char *scsi3addr)
 	CPQ_TAPE_UNLOCK(ctlr, flags);
 }
 
+static int device_is_the_same(struct cciss_scsi_dev_t *dev1,
+	struct cciss_scsi_dev_t *dev2)
+{
+	return dev1->devtype == dev2->devtype &&
+		memcmp(dev1->scsi3addr, dev2->scsi3addr,
+			sizeof(dev1->scsi3addr)) == 0 &&
+		memcmp(dev1->device_id, dev2->device_id,
+			sizeof(dev1->device_id)) == 0 &&
+		memcmp(dev1->vendor, dev2->vendor,
+			sizeof(dev1->vendor)) == 0 &&
+		memcmp(dev1->model, dev2->model,
+			sizeof(dev1->model)) == 0 &&
+		memcmp(dev1->revision, dev2->revision,
+			sizeof(dev1->revision)) == 0;
+}
+
 static int
 adjust_cciss_scsi_table(int ctlr, int hostno,
 	struct cciss_scsi_dev_t sd[], int nsds)
@@ -532,7 +552,7 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 		for (j=0;j<nsds;j++) {
 			if (SCSI3ADDR_EQ(sd[j].scsi3addr,
 				csd->scsi3addr)) {
-				if (sd[j].devtype == csd->devtype)
+				if (device_is_the_same(&sd[j], csd))
 					found=2;
 				else
 					found=1;
@@ -548,22 +568,26 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 			cciss_scsi_remove_entry(ctlr, hostno, i,
 				removed, &nremoved);
 			/* remove ^^^, hence i not incremented */
-		} 
-		else if (found == 1) { /* device is different kind */
+		} else if (found == 1) { /* device is different in some way */
 			changes++;
-			printk("cciss%d: device c%db%dt%dl%d type changed "
-				"(device type now %s).\n",
-				ctlr, hostno, csd->bus, csd->target, csd->lun,
-					scsi_device_type(csd->devtype));
+			printk("cciss%d: device c%db%dt%dl%d has changed.\n",
+				ctlr, hostno, csd->bus, csd->target, csd->lun);
 			cciss_scsi_remove_entry(ctlr, hostno, i,
 				removed, &nremoved);
 			/* remove ^^^, hence i not incremented */
-			if (cciss_scsi_add_entry(ctlr, hostno,
-				&sd[j].scsi3addr[0], sd[j].devtype,
+			if (cciss_scsi_add_entry(ctlr, hostno, &sd[j],
 				added, &nadded) != 0)
 				/* we just removed one, so add can't fail. */
 					BUG();
 			csd->devtype = sd[j].devtype;
+			memcpy(csd->device_id, sd[j].device_id,
+				sizeof(csd->device_id));
+			memcpy(csd->vendor, sd[j].vendor,
+				sizeof(csd->vendor));
+			memcpy(csd->model, sd[j].model,
+				sizeof(csd->model));
+			memcpy(csd->revision, sd[j].revision,
+				sizeof(csd->revision));
 		} else 		/* device is same as it ever was, */
 			i++;	/* so just move along. */
 	}
@@ -577,7 +601,7 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 			csd = &ccissscsi[ctlr].dev[j];
 			if (SCSI3ADDR_EQ(sd[i].scsi3addr,
 				csd->scsi3addr)) {
-				if (sd[i].devtype == csd->devtype)
+				if (device_is_the_same(&sd[i], csd))
 					found=2;	/* found device */
 				else
 					found=1; 	/* found a bug. */
@@ -586,16 +610,14 @@ adjust_cciss_scsi_table(int ctlr, int hostno,
 		}
 		if (!found) {
 			changes++;
-			if (cciss_scsi_add_entry(ctlr, hostno, 
-
-				&sd[i].scsi3addr[0], sd[i].devtype,
+			if (cciss_scsi_add_entry(ctlr, hostno, &sd[i],
 				added, &nadded) != 0)
 				break;
 		} else if (found == 1) {
 			/* should never happen... */
 			changes++;
-			printk("cciss%d: device unexpectedly changed type\n",
-				ctlr);
+			printk(KERN_WARNING "cciss%d: device "
+				"unexpectedly changed\n", ctlr);
 			/* but if it does happen, we just ignore that device */
 		}
 	}
@@ -1012,7 +1034,8 @@ cciss_scsi_interpret_error(CommandList_struct *cp)
 
 static int
 cciss_scsi_do_inquiry(ctlr_info_t *c, unsigned char *scsi3addr, 
-		 unsigned char *buf, unsigned char bufsize)
+	unsigned char page, unsigned char *buf,
+	unsigned char bufsize)
 {
 	int rc;
 	CommandList_struct *cp;
@@ -1032,8 +1055,8 @@ cciss_scsi_do_inquiry(ctlr_info_t *c, unsigned char *scsi3addr,
 	ei = cp->err_info; 
 
 	cdb[0] = CISS_INQUIRY;
-	cdb[1] = 0;
-	cdb[2] = 0;
+	cdb[1] = (page != 0);
+	cdb[2] = page;
 	cdb[3] = 0;
 	cdb[4] = bufsize;
 	cdb[5] = 0;
@@ -1053,6 +1076,25 @@ cciss_scsi_do_inquiry(ctlr_info_t *c, unsigned char *scsi3addr,
 	return rc;	
 }
 
+/* Get the device id from inquiry page 0x83 */
+static int cciss_scsi_get_device_id(ctlr_info_t *c, unsigned char *scsi3addr,
+	unsigned char *device_id, int buflen)
+{
+	int rc;
+	unsigned char *buf;
+
+	if (buflen > 16)
+		buflen = 16;
+	buf = kzalloc(64, GFP_KERNEL);
+	if (!buf)
+		return -1;
+	rc = cciss_scsi_do_inquiry(c, scsi3addr, 0x83, buf, 64);
+	if (rc == 0)
+		memcpy(device_id, &buf[8], buflen);
+	kfree(buf);
+	return rc != 0;
+}
+
 static int
 cciss_scsi_do_report_phys_luns(ctlr_info_t *c, 
 		ReportLunData_struct *buf, int bufsize)
@@ -1142,25 +1184,21 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 	ctlr_info_t *c;
 	__u32 num_luns=0;
 	unsigned char *ch;
-	/* unsigned char found[CCISS_MAX_SCSI_DEVS_PER_HBA]; */
-	struct cciss_scsi_dev_t currentsd[CCISS_MAX_SCSI_DEVS_PER_HBA];
+	struct cciss_scsi_dev_t *currentsd, *this_device;
 	int ncurrent=0;
 	int reportlunsize = sizeof(*ld_buff) + CISS_MAX_PHYS_LUN * 8;
 	int i;
 
 	c = (ctlr_info_t *) hba[cntl_num];	
 	ld_buff = kzalloc(reportlunsize, GFP_KERNEL);
-	if (ld_buff == NULL) {
-		printk(KERN_ERR "cciss: out of memory\n");
-		return;
-	}
 	inq_buff = kmalloc(OBDR_TAPE_INQ_SIZE, GFP_KERNEL);
-        if (inq_buff == NULL) {
-                printk(KERN_ERR "cciss: out of memory\n");
-                kfree(ld_buff);
-                return;
+	currentsd = kzalloc(sizeof(*currentsd) *
+			(CCISS_MAX_SCSI_DEVS_PER_HBA+1), GFP_KERNEL);
+	if (ld_buff == NULL || inq_buff == NULL || currentsd == NULL) {
+		printk(KERN_ERR "cciss: out of memory\n");
+		goto out;
 	}
-
+	this_device = &currentsd[CCISS_MAX_SCSI_DEVS_PER_HBA];
 	if (cciss_scsi_do_report_phys_luns(c, ld_buff, reportlunsize) == 0) {
 		ch = &ld_buff->LUNListLength[0];
 		num_luns = ((ch[0]<<24) | (ch[1]<<16) | (ch[2]<<8) | ch[3]) / 8;
@@ -1179,23 +1217,34 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 
 
 	/* adjust our table of devices */	
-	for(i=0; i<num_luns; i++)
-	{
-		int devtype;
-
+	for (i = 0; i < num_luns; i++) {
 		/* for each physical lun, do an inquiry */
 		if (ld_buff->LUN[i][3] & 0xC0) continue;
 		memset(inq_buff, 0, OBDR_TAPE_INQ_SIZE);
 		memcpy(&scsi3addr[0], &ld_buff->LUN[i][0], 8);
 
-		if (cciss_scsi_do_inquiry(hba[cntl_num], scsi3addr, inq_buff,
-			(unsigned char) OBDR_TAPE_INQ_SIZE) != 0) {
+		if (cciss_scsi_do_inquiry(hba[cntl_num], scsi3addr, 0, inq_buff,
+			(unsigned char) OBDR_TAPE_INQ_SIZE) != 0)
 			/* Inquiry failed (msg printed already) */
-			devtype = 0; /* so we will skip this device. */
-		} else /* what kind of device is this? */
-			devtype = (inq_buff[0] & 0x1f);
+			continue; /* so we will skip this device. */
 
-		switch (devtype)
+		this_device->devtype = (inq_buff[0] & 0x1f);
+		this_device->bus = -1;
+		this_device->target = -1;
+		this_device->lun = -1;
+		memcpy(this_device->scsi3addr, scsi3addr, 8);
+		memcpy(this_device->vendor, &inq_buff[8],
+			sizeof(this_device->vendor));
+		memcpy(this_device->model, &inq_buff[16],
+			sizeof(this_device->model));
+		memcpy(this_device->revision, &inq_buff[32],
+			sizeof(this_device->revision));
+		memset(this_device->device_id, 0,
+			sizeof(this_device->device_id));
+		cciss_scsi_get_device_id(hba[cntl_num], scsi3addr,
+			this_device->device_id, sizeof(this_device->device_id));
+
+		switch (this_device->devtype)
 		{
 		  case 0x05: /* CD-ROM */ {
 
@@ -1220,15 +1269,10 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 			if (ncurrent >= CCISS_MAX_SCSI_DEVS_PER_HBA) {
 				printk(KERN_INFO "cciss%d: %s ignored, "
 					"too many devices.\n", cntl_num,
-					scsi_device_type(devtype));
+					scsi_device_type(this_device->devtype));
 				break;
 			}
-			memcpy(&currentsd[ncurrent].scsi3addr[0], 
-				&scsi3addr[0], 8);
-			currentsd[ncurrent].devtype = devtype;
-			currentsd[ncurrent].bus = -1;
-			currentsd[ncurrent].target = -1;
-			currentsd[ncurrent].lun = -1;
+			currentsd[ncurrent] = *this_device;
 			ncurrent++;
 			break;
 		  default: 
@@ -1240,6 +1284,7 @@ cciss_update_non_disk_devices(int cntl_num, int hostno)
 out:
 	kfree(inq_buff);
 	kfree(ld_buff);
+	kfree(currentsd);
 	return;
 }
 
diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h
index d9c2c586502f..7b750245ae76 100644
--- a/drivers/block/cciss_scsi.h
+++ b/drivers/block/cciss_scsi.h
@@ -66,6 +66,10 @@ struct cciss_scsi_dev_t {
 	int devtype;
 	int bus, target, lun;		/* as presented to the OS */
 	unsigned char scsi3addr[8];	/* as presented to the HW */
+	unsigned char device_id[16];	/* from inquiry pg. 0x83 */
+	unsigned char vendor[8];	/* bytes 8-15 of inquiry data */
+	unsigned char model[16];	/* bytes 16-31 of inquiry data */
+	unsigned char revision[4];	/* bytes 32-35 of inquiry data */
 };
 
 struct cciss_scsi_hba_t {

From 061837bc8687edc2739ef02f721b7ae0b8076390 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 22 Sep 2008 14:57:16 -0700
Subject: [PATCH 107/132] drivers/block: Use DIV_ROUND_UP

The kernel.h macro DIV_ROUND_UP performs the computation (((n) + (d) - 1) /
(d)) but is perhaps more readable.

An extract of the semantic patch that makes this change is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@haskernel@
@@

#include <linux/kernel.h>

@depends on haskernel@
expression n,d;
@@

(
- (n + d - 1) / d
+ DIV_ROUND_UP(n,d)
|
- (n + (d - 1)) / d
+ DIV_ROUND_UP(n,d)
)

@depends on haskernel@
expression n,d;
@@

- DIV_ROUND_UP((n),d)
+ DIV_ROUND_UP(n,d)

@depends on haskernel@
expression n,d;
@@

- DIV_ROUND_UP(n,(d))
+ DIV_ROUND_UP(n,d)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Cc: <mike.miller@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c    | 8 ++++----
 drivers/block/cpqarray.c | 2 +-
 drivers/block/floppy.c   | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index b73116ef9236..1e1f9153000c 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -3460,8 +3460,8 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	       hba[i]->intr[SIMPLE_MODE_INT], dac ? "" : " not");
 
 	hba[i]->cmd_pool_bits =
-	    kmalloc(((hba[i]->nr_cmds + BITS_PER_LONG -
-		      1) / BITS_PER_LONG) * sizeof(unsigned long), GFP_KERNEL);
+	    kmalloc(DIV_ROUND_UP(hba[i]->nr_cmds, BITS_PER_LONG)
+			* sizeof(unsigned long), GFP_KERNEL);
 	hba[i]->cmd_pool = (CommandList_struct *)
 	    pci_alloc_consistent(hba[i]->pdev,
 		    hba[i]->nr_cmds * sizeof(CommandList_struct),
@@ -3493,8 +3493,8 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	/* command and error info recs zeroed out before
 	   they are used */
 	memset(hba[i]->cmd_pool_bits, 0,
-	       ((hba[i]->nr_cmds + BITS_PER_LONG -
-		 1) / BITS_PER_LONG) * sizeof(unsigned long));
+	       DIV_ROUND_UP(hba[i]->nr_cmds, BITS_PER_LONG)
+			* sizeof(unsigned long));
 
 	hba[i]->num_luns = 0;
 	hba[i]->highest_lun = -1;
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 09c14341e6e3..3d967525e9a9 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -424,7 +424,7 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev)
 		hba[i]->pci_dev, NR_CMDS * sizeof(cmdlist_t),
 		&(hba[i]->cmd_pool_dhandle));
 	hba[i]->cmd_pool_bits = kcalloc(
-		(NR_CMDS+BITS_PER_LONG-1)/BITS_PER_LONG, sizeof(unsigned long),
+		DIV_ROUND_UP(NR_CMDS, BITS_PER_LONG), sizeof(unsigned long),
 		GFP_KERNEL);
 
 	if (!hba[i]->cmd_pool_bits || !hba[i]->cmd_pool)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 395f8ea7981c..9c0b494f5e87 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -1355,20 +1355,20 @@ static void fdc_specify(void)
 	}
 
 	/* Convert step rate from microseconds to milliseconds and 4 bits */
-	srt = 16 - (DP->srt * scale_dtr / 1000 + NOMINAL_DTR - 1) / NOMINAL_DTR;
+	srt = 16 - DIV_ROUND_UP(DP->srt * scale_dtr / 1000, NOMINAL_DTR);
 	if (slow_floppy) {
 		srt = srt / 4;
 	}
 	SUPBOUND(srt, 0xf);
 	INFBOUND(srt, 0);
 
-	hlt = (DP->hlt * scale_dtr / 2 + NOMINAL_DTR - 1) / NOMINAL_DTR;
+	hlt = DIV_ROUND_UP(DP->hlt * scale_dtr / 2, NOMINAL_DTR);
 	if (hlt < 0x01)
 		hlt = 0x01;
 	else if (hlt > 0x7f)
 		hlt = hlt_max_code;
 
-	hut = (DP->hut * scale_dtr / 16 + NOMINAL_DTR - 1) / NOMINAL_DTR;
+	hut = DIV_ROUND_UP(DP->hut * scale_dtr / 16, NOMINAL_DTR);
 	if (hut < 0x1)
 		hut = 0x1;
 	else if (hut > 0xf)
@@ -2385,7 +2385,7 @@ static void rw_interrupt(void)
 
 #ifdef FLOPPY_SANITY_CHECK
 	if (nr_sectors / ssize >
-	    (in_sector_offset + current_count_sectors + ssize - 1) / ssize) {
+	    DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) {
 		DPRINT("long rw: %x instead of %lx\n",
 		       nr_sectors, current_count_sectors);
 		printk("rs=%d s=%d\n", R_SECTOR, SECTOR);

From 9e49184c82e9ec3ab4d45f9ea5a17ccaf43869f0 Mon Sep 17 00:00:00 2001
From: Keith Wansbrough <keith@lochan.org>
Date: Mon, 22 Sep 2008 14:57:17 -0700
Subject: [PATCH 108/132] floppy: support arbitrary first-sector numbers

The current floppy_struct allows floppies to number sectors starting
from 0 or 1.  This patch allows arbitrary first-sector numbers - for
example, 0xC1 for Amstrad CPC disks.

This extends the existing 1-bit field (FD_ZEROBASED, bit 2 of stretch)
to 8 bits (FD_SECTMASK, bits 2 to 9).

Currently 0x00 denotes a first sector number of 1, and 0x01 denotes a
first sector number of 0.  We extend this by interpreting FD_SECTMASK
as the first sector number with the LSB flipped.

Signed-off-by: Keith Wansbrough <keith@lochan.org>
Cc: Alain Knaff <alain@linux.lu>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Karel Zak <kzak@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/floppy.c | 23 +++++++++++++++--------
 include/linux/fd.h     |  8 +++++++-
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 9c0b494f5e87..cf64ddf5d839 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -423,8 +423,15 @@ static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
  * 1581's logical side 0 is on physical side 1, whereas the Sharp's logical
  * side 0 is on physical side 0 (but with the misnamed sector IDs).
  * 'stretch' should probably be renamed to something more general, like
- * 'options'.  Other parameters should be self-explanatory (see also
- * setfdprm(8)).
+ * 'options'.
+ *
+ * Bits 2 through 9 of 'stretch' tell the number of the first sector.
+ * The LSB (bit 2) is flipped. For most disks, the first sector
+ * is 1 (represented by 0x00<<2).  For some CP/M and music sampler
+ * disks (such as Ensoniq EPS 16plus) it is 0 (represented as 0x01<<2).
+ * For Amstrad CPC disks it is 0xC1 (represented as 0xC0<<2).
+ *
+ * Other parameters should be self-explanatory (see also setfdprm(8)).
  */
 /*
 	    Size
@@ -2236,9 +2243,9 @@ static void setup_format_params(int track)
 			}
 		}
 	}
-	if (_floppy->stretch & FD_ZEROBASED) {
+	if (_floppy->stretch & FD_SECTBASEMASK) {
 		for (count = 0; count < F_SECT_PER_TRACK; count++)
-			here[count].sect--;
+			here[count].sect += FD_SECTBASE(_floppy) - 1;
 	}
 }
 
@@ -2649,7 +2656,7 @@ static int make_raw_rw_request(void)
 	}
 	HEAD = fsector_t / _floppy->sect;
 
-	if (((_floppy->stretch & (FD_SWAPSIDES | FD_ZEROBASED)) ||
+	if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) ||
 	     TESTF(FD_NEED_TWADDLE)) && fsector_t < _floppy->sect)
 		max_sector = _floppy->sect;
 
@@ -2679,7 +2686,7 @@ static int make_raw_rw_request(void)
 	CODE2SIZE;
 	SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE;
 	SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) +
-	    ((_floppy->stretch & FD_ZEROBASED) ? 0 : 1);
+	    FD_SECTBASE(_floppy);
 
 	/* tracksize describes the size which can be filled up with sectors
 	 * of size ssize.
@@ -3311,7 +3318,7 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 	    g->head <= 0 ||
 	    g->track <= 0 || g->track > UDP->tracks >> STRETCH(g) ||
 	    /* check if reserved bits are set */
-	    (g->stretch & ~(FD_STRETCH | FD_SWAPSIDES | FD_ZEROBASED)) != 0)
+	    (g->stretch & ~(FD_STRETCH | FD_SWAPSIDES | FD_SECTBASEMASK)) != 0)
 		return -EINVAL;
 	if (type) {
 		if (!capable(CAP_SYS_ADMIN))
@@ -3356,7 +3363,7 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 		if (DRS->maxblock > user_params[drive].sect ||
 		    DRS->maxtrack ||
 		    ((user_params[drive].sect ^ oldStretch) &
-		     (FD_SWAPSIDES | FD_ZEROBASED)))
+		     (FD_SWAPSIDES | FD_SECTBASEMASK)))
 			invalidate_drive(bdev);
 		else
 			process_fd_request();
diff --git a/include/linux/fd.h b/include/linux/fd.h
index b6bd41d2b460..f5d194af07a8 100644
--- a/include/linux/fd.h
+++ b/include/linux/fd.h
@@ -15,10 +15,16 @@ struct floppy_struct {
 			sect,		/* sectors per track */
 			head,		/* nr of heads */
 			track,		/* nr of tracks */
-			stretch;	/* !=0 means double track steps */
+			stretch;	/* bit 0 !=0 means double track steps */
+					/* bit 1 != 0 means swap sides */
+					/* bits 2..9 give the first sector */
+					/*  number (the LSB is flipped) */
 #define FD_STRETCH 1
 #define FD_SWAPSIDES 2
 #define FD_ZEROBASED 4
+#define FD_SECTBASEMASK 0x3FC
+#define FD_MKSECTBASE(s) (((s) ^ 1) << 2)
+#define FD_SECTBASE(floppy) ((((floppy)->stretch & FD_SECTBASEMASK) >> 2) ^ 1)
 
 	unsigned char	gap,		/* gap1 size */
 

From a68bbddba486020c9c74825ce90c4c1ec463e0e8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 24 Sep 2008 13:03:33 +0200
Subject: [PATCH 109/132] block: add queue flag for SSD/non-rotational devices

We don't want to idle in AS/CFQ if the device doesn't have a seek
penalty. So add a QUEUE_FLAG_NONROT to indicate a non-rotational
device. Low-level drivers should set this flag upon discovery of
an SSD or similar device type.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c     | 6 ++++++
 block/cfq-iosched.c    | 6 ++++++
 include/linux/blkdev.h | 2 ++
 3 files changed, 14 insertions(+)

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 80af9257e64a..4c6fafbba933 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -745,6 +745,12 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
  */
 static int as_can_anticipate(struct as_data *ad, struct request *rq)
 {
+	/*
+	 * SSD device without seek penalty, disable idling
+	 */
+	if (blk_queue_nonrot(ad->q))
+		return 0;
+
 	if (!ad->io_context)
 		/*
 		 * Last request submitted was a write
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 494b6fdcb183..03a5953bb5df 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -878,6 +878,12 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	struct cfq_io_context *cic;
 	unsigned long sl;
 
+	/*
+	 * SSD device without seek penalty, disable idling
+	 */
+	if (blk_queue_nonrot(cfqd->queue))
+		return;
+
 	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
 	WARN_ON(cfq_cfqq_slice_new(cfqq));
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 86f77ef127f4..0cf3e619fb21 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -442,6 +442,7 @@ struct request_queue
 #define QUEUE_FLAG_SAME_COMP   11	/* force complete on same CPU */
 #define QUEUE_FLAG_FAIL_IO     12	/* fake timeout */
 #define QUEUE_FLAG_STACKABLE   13	/* supports request stacking */
+#define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
@@ -547,6 +548,7 @@ enum {
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
+#define blk_queue_nonrot(q)	test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
 #define blk_queue_flushing(q)	((q)->ordseq)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)

From 8bff7c6b0f63c7ee9c5e3a076338d74125b8debb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 24 Sep 2008 13:05:10 +0200
Subject: [PATCH 110/132] libata: set queue SSD flag for SSD devices

SSD devices should give an RPM setting of 1 in word 217 of the ID
page. If we see such a device, tell the block layer about it.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ata/libata-scsi.c | 4 ++++
 include/linux/ata.h       | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index b9d3ba423cb2..054370700abf 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -977,6 +977,10 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
 
 		blk_queue_dma_drain(q, atapi_drain_needed, buf, ATAPI_MAX_DRAIN);
 	} else {
+		if (ata_id_is_ssd(dev->id))
+			queue_flag_set_unlocked(QUEUE_FLAG_NONROT,
+						sdev->request_queue);
+
 		/* ATA devices must be sector aligned */
 		blk_queue_update_dma_alignment(sdev->request_queue,
 					       ATA_SECT_SIZE - 1);
diff --git a/include/linux/ata.h b/include/linux/ata.h
index 8a12d718c169..c1c8b4a4ba26 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -88,6 +88,7 @@ enum {
 	ATA_ID_DLF		= 128,
 	ATA_ID_CSFO		= 129,
 	ATA_ID_CFA_POWER	= 160,
+	ATA_ID_ROT_SPEED	= 217,
 	ATA_ID_PIO4		= (1 << 1),
 
 	ATA_ID_SERNO_LEN	= 20,
@@ -691,6 +692,11 @@ static inline int ata_id_is_cfa(const u16 *id)
 	return 0;
 }
 
+static inline int ata_id_is_ssd(const u16 *id)
+{
+	return id[ATA_ID_ROT_SPEED] == 0x01;
+}
+
 static inline int ata_drive_40wire(const u16 *dev_id)
 {
 	if (ata_id_is_sata(dev_id))

From f7d7b7a7a3db6526a84ea755c1c54a051e9a52de Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 25 Sep 2008 11:37:50 +0200
Subject: [PATCH 111/132] block: as/cfq ssd idle check update

We really need to know about the hardware tagging support as well,
since if the SSD does not do tagging then we still want to idle.
Otherwise we have the same dependent sync IO vs flooding async IO
problem as on rotational media.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c  | 4 +++-
 block/cfq-iosched.c | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 4c6fafbba933..71f0abb219ee 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -745,11 +745,13 @@ static int as_can_break_anticipation(struct as_data *ad, struct request *rq)
  */
 static int as_can_anticipate(struct as_data *ad, struct request *rq)
 {
+#if 0 /* disable for now, we need to check tag level as well */
 	/*
 	 * SSD device without seek penalty, disable idling
 	 */
-	if (blk_queue_nonrot(ad->q))
+	if (blk_queue_nonrot(ad->q)) axman
 		return 0;
+#endif
 
 	if (!ad->io_context)
 		/*
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 03a5953bb5df..6a062eebbd15 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -879,9 +879,11 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 	unsigned long sl;
 
 	/*
-	 * SSD device without seek penalty, disable idling
+	 * SSD device without seek penalty, disable idling. But only do so
+	 * for devices that support queuing, otherwise we still have a problem
+	 * with sync vs async workloads.
 	 */
-	if (blk_queue_nonrot(cfqd->queue))
+	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
 		return;
 
 	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));

From e3ba9ae58a5599226e3976b29c8093041ae7c332 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 25 Sep 2008 11:42:41 +0200
Subject: [PATCH 112/132] block: reserve some tags just for sync IO

By only allowing async IO to consume 3/4 of the tag depth, we
always have slots free to serve sync IO. This is important to avoid
having writes fill the entire tag queue, thus starving reads.

Original patch and idea from Linus Torvalds <torvalds@linux-foundation.org>

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-tag.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/block/blk-tag.c b/block/blk-tag.c
index 8a99688eb1b1..c0d419e84ce7 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -337,6 +337,7 @@ EXPORT_SYMBOL(blk_queue_end_tag);
 int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
+	unsigned max_depth, offset;
 	int tag;
 
 	if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
@@ -350,10 +351,19 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	/*
 	 * Protect against shared tag maps, as we may not have exclusive
 	 * access to the tag map.
+	 *
+	 * We reserve a few tags just for sync IO, since we don't want
+	 * to starve sync IO on behalf of flooding async IO.
 	 */
+	max_depth = bqt->max_depth;
+	if (rq_is_sync(rq))
+		offset = 0;
+	else
+		offset = max_depth >> 2;
+
 	do {
-		tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
-		if (tag >= bqt->max_depth)
+		tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
+		if (tag >= max_depth)
 			return 1;
 
 	} while (test_and_set_bit_lock(tag, bqt->tag_map));

From c0ddffa84a7d12da9943a94d04dadbfb1883b904 Mon Sep 17 00:00:00 2001
From: Sven Schuetz <sven@linux.vnet.ibm.com>
Date: Fri, 26 Sep 2008 10:58:02 +0200
Subject: [PATCH 113/132] include blktrace_api.h in headers_install

This header file is of interest for user space programming, i.e.
for tools that process blktrace data.

We would like to use it for a tool on top of blktrace which processes
data provided by blktrace. For this purpose, it would be helpful
if the blktrace API would make it to /usr/include/linux.

The git tree for the blktrace tools comes with its own copy of this header
file. I didn't manage to replace that copy with the file generated
by the patch below yet. A few more cleanups would be needed.
For example, the blktrace ioctl numbers, which are currently defined in
usr/include/fs.h, might need to be moved. Should be feasible, though.

Signed-off-by: Sven Schuetz <sven@linux.vnet.ibm.com>
Signed-off-by: Martin Peschke <mp3@de.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/Kbuild         |  1 +
 include/linux/blktrace_api.h | 58 ++++++++++++++++++++----------------
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index b68ec09399be..31474e89c59a 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -180,6 +180,7 @@ unifdef-y += audit.h
 unifdef-y += auto_fs.h
 unifdef-y += auxvec.h
 unifdef-y += binfmts.h
+unifdef-y += blktrace_api.h
 unifdef-y += capability.h
 unifdef-y += capi.h
 unifdef-y += cciss_ioctl.h
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index dcaf2452ed1f..a2a7d0ca2758 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -1,8 +1,10 @@
 #ifndef BLKTRACE_H
 #define BLKTRACE_H
 
+#ifdef __KERNEL__
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#endif
 
 /*
  * Trace categories
@@ -92,17 +94,17 @@ enum blktrace_notify {
  * The trace itself
  */
 struct blk_io_trace {
-	u32 magic;		/* MAGIC << 8 | version */
-	u32 sequence;		/* event number */
-	u64 time;		/* in microseconds */
-	u64 sector;		/* disk offset */
-	u32 bytes;		/* transfer length */
-	u32 action;		/* what happened */
-	u32 pid;		/* who did it */
-	u32 device;		/* device number */
-	u32 cpu;		/* on what cpu did it happen */
-	u16 error;		/* completion error */
-	u16 pdu_len;		/* length of data after this trace */
+	__u32 magic;		/* MAGIC << 8 | version */
+	__u32 sequence;		/* event number */
+	__u64 time;		/* in microseconds */
+	__u64 sector;		/* disk offset */
+	__u32 bytes;		/* transfer length */
+	__u32 action;		/* what happened */
+	__u32 pid;		/* who did it */
+	__u32 device;		/* device number */
+	__u32 cpu;		/* on what cpu did it happen */
+	__u16 error;		/* completion error */
+	__u16 pdu_len;		/* length of data after this trace */
 };
 
 /*
@@ -120,6 +122,25 @@ enum {
 	Blktrace_stopped,
 };
 
+/*
+ * User setup structure passed with BLKTRACESTART
+ */
+struct blk_user_trace_setup {
+#ifdef __KERNEL__
+	char name[BDEVNAME_SIZE];	/* output */
+#else
+	char name[32];			/* output */
+#endif
+	__u16 act_mask;			/* input */
+	__u32 buf_size;			/* input */
+	__u32 buf_nr;			/* input */
+	__u64 start_lba;
+	__u64 end_lba;
+	__u32 pid;
+};
+
+#ifdef __KERNEL__
+#if defined(CONFIG_BLK_DEV_IO_TRACE)
 struct blk_trace {
 	int trace_state;
 	struct rchan *rchan;
@@ -136,21 +157,6 @@ struct blk_trace {
 	atomic_t dropped;
 };
 
-/*
- * User setup structure passed with BLKTRACESTART
- */
-struct blk_user_trace_setup {
-	char name[BDEVNAME_SIZE];	/* output */
-	u16 act_mask;			/* input */
-	u32 buf_size;			/* input */
-	u32 buf_nr;			/* input */
-	u64 start_lba;
-	u64 end_lba;
-	u32 pid;
-};
-
-#ifdef __KERNEL__
-#if defined(CONFIG_BLK_DEV_IO_TRACE)
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);

From 336c3d8ce771608815b65bcfa27a17a83b297328 Mon Sep 17 00:00:00 2001
From: Elias Oltmanns <eo@nebensachen.de>
Date: Wed, 1 Oct 2008 16:02:33 +0200
Subject: [PATCH 114/132] block: Fix blk_start_queueing() to not kick a stopped
 queue

blk_start_queueing() should act like the generic queue unplugging
and kicking and ignore a stopped queue. Such a queue may not be
run until after a call to blk_start_queue().

Signed-off-by: Elias Oltmanns <eo@nebensachen.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index fa212348c4c9..c66333d8e48d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -890,9 +890,11 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_start_queueing(struct request_queue *q)
 {
-	if (!blk_queue_plugged(q))
+	if (!blk_queue_plugged(q)) {
+		if (unlikely(blk_queue_stopped(q)))
+			return;
 		q->request_fn(q);
-	else
+	} else
 		__generic_unplug_device(q);
 }
 EXPORT_SYMBOL(blk_start_queueing);

From ef9e3facdf1fe1228721a7c295a76d1b7a0e57ec Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 1 Oct 2008 16:12:15 +0200
Subject: [PATCH 115/132] block: add lld busy state exporting interface

This patch adds a new interface, blk_lld_busy(), to check lld's
busy state from the block layer.
blk_lld_busy() calls down into low-level drivers for the checking
if the drivers set q->lld_busy_fn() using blk_queue_lld_busy().

This resolves a performance problem on request stacking devices below.

Some drivers, like the SCSI mid layer, stop dispatching requests when
they detect a busy state on a low-level device like host/target/device.
It allows other requests to stay in the I/O scheduler's queue
for a chance of merging.

Request stacking drivers like request-based dm should follow
the same logic.
However, there is no generic interface for the stacked device
to check if the underlying device(s) are busy.
If the request stacking driver dispatches and submits requests to
the busy underlying device, the requests will stay in
the underlying device's queue without a chance of merging.
This causes a performance problem on burst I/O load.

With this patch, busy state of the underlying device is exported
via q->lld_busy_fn().  So the request stacking driver can check it
and stop dispatching requests if busy.

The underlying device driver must return the busy state appropriately:
    1: when the device driver can't process requests immediately.
    0: when the device driver can process requests immediately,
       including abnormal situations where the device driver needs
       to kill all requests.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 28 ++++++++++++++++++++++++++++
 block/blk-settings.c   |  6 ++++++
 include/linux/blkdev.h |  4 ++++
 3 files changed, 38 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index c66333d8e48d..b2d0ac8b760e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2100,6 +2100,34 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
+/**
+ * blk_lld_busy - Check if underlying low-level drivers of a device are busy
+ * @q : the queue of the device being checked
+ *
+ * Description:
+ *    Check if underlying low-level drivers of a device are busy.
+ *    If the drivers want to export their busy state, they must set own
+ *    exporting function using blk_queue_lld_busy() first.
+ *
+ *    Basically, this function is used only by request stacking drivers
+ *    to stop dispatching requests to underlying devices when underlying
+ *    devices are busy.  This behavior helps more I/O merging on the queue
+ *    of the request stacking driver and prevents I/O throughput regression
+ *    on burst I/O load.
+ *
+ * Return:
+ *    0 - Not busy (The request stacking driver should dispatch request)
+ *    1 - Busy (The request stacking driver should stop dispatching request)
+ */
+int blk_lld_busy(struct request_queue *q)
+{
+	if (q->lld_busy_fn)
+		return q->lld_busy_fn(q);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_lld_busy);
+
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1d0330d0b40a..b21dcdb64151 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -89,6 +89,12 @@ void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
 }
 EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
 
+void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
+{
+	q->lld_busy_fn = fn;
+}
+EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
+
 /**
  * blk_queue_make_request - define an alternate make_request function for a device
  * @q:  the request queue for the device to be affected
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0cf3e619fb21..9e0ee1a8254e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -269,6 +269,7 @@ typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
 typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
+typedef int (lld_busy_fn) (struct request_queue *q);
 
 enum blk_eh_timer_return {
 	BLK_EH_NOT_HANDLED,
@@ -325,6 +326,7 @@ struct request_queue
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
+	lld_busy_fn		*lld_busy_fn;
 
 	/*
 	 * Dispatch queue sorting
@@ -699,6 +701,7 @@ extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
+extern int blk_lld_busy(struct request_queue *q);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_plug_device(struct request_queue *);
@@ -835,6 +838,7 @@ extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
 extern int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size);
+extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);

From 0497b345e7d067109e0dd9bf9f4978a6847ee13b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 1 Oct 2008 16:16:25 +0200
Subject: [PATCH 116/132] blktrace: use BLKTRACE_BDEV_SIZE as the name size for
 setup structure

Define as 32, which is what BDEVNAME_SIZE is/was as well. This keeps
the user interface the same and gets rid of the difference between
kernel and user api here.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c             | 3 ++-
 include/linux/blktrace_api.h | 8 +++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/block/blktrace.c b/block/blktrace.c
index 9e0212c90b29..85049a7e7a17 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -369,7 +369,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
 
-	strcpy(buts->name, name);
+	strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
+	buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
 
 	/*
 	 * some device names have larger paths - convert the slashes
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index a2a7d0ca2758..3a31eb506164 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -122,15 +122,13 @@ enum {
 	Blktrace_stopped,
 };
 
+#define BLKTRACE_BDEV_SIZE	32
+
 /*
  * User setup structure passed with BLKTRACESTART
  */
 struct blk_user_trace_setup {
-#ifdef __KERNEL__
-	char name[BDEVNAME_SIZE];	/* output */
-#else
-	char name[32];			/* output */
-#endif
+	char name[BLKTRACE_BDEV_SIZE];	/* output */
 	__u16 act_mask;			/* input */
 	__u32 buf_size;			/* input */
 	__u32 buf_nr;			/* input */

From 8316982ac06d7d8875dc8738efbb030791dc33bb Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 1 Oct 2008 10:11:20 -0400
Subject: [PATCH 117/132] virtio_blk: change to use __blk_end_request()

This patch converts virtio_blk to use __blk_end_request() directly
so that end_{queued|dequeued}_request() can be removed.
Related 'uptodate' argument is converted to 'error'.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/virtio_blk.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 879506a2c234..6ec5fc052786 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -47,20 +47,20 @@ static void blk_done(struct virtqueue *vq)
 
 	spin_lock_irqsave(&vblk->lock, flags);
 	while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
-		int uptodate;
+		int error;
 		switch (vbr->status) {
 		case VIRTIO_BLK_S_OK:
-			uptodate = 1;
+			error = 0;
 			break;
 		case VIRTIO_BLK_S_UNSUPP:
-			uptodate = -ENOTTY;
+			error = -ENOTTY;
 			break;
 		default:
-			uptodate = 0;
+			error = -EIO;
 			break;
 		}
 
-		end_dequeued_request(vbr->req, uptodate);
+		__blk_end_request(vbr->req, error, blk_rq_bytes(vbr->req));
 		list_del(&vbr->list);
 		mempool_free(vbr, vblk->pool);
 	}

From 2a9df5055a99df25533daf4041fdb99f0ed3463c Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 1 Oct 2008 10:12:15 -0400
Subject: [PATCH 118/132] memstick: change to use __blk_end_request()

This patch converts memstick to use __blk_end_request() directly
so that end_{queued|dequeued}_request() can be removed.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alex Dubov <oakad@yahoo.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/memstick/core/mspro_block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 82bf649ef138..6e291bf8237a 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -828,7 +828,7 @@ static void mspro_block_submit_req(struct request_queue *q)
 
 	if (msb->eject) {
 		while ((req = elv_next_request(q)) != NULL)
-			end_queued_request(req, -ENODEV);
+			__blk_end_request(req, -ENODEV, blk_rq_bytes(req));
 
 		return;
 	}

From 7afb3a6e752503d5ebeb038336aa0fa886a51b44 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 1 Oct 2008 10:13:02 -0400
Subject: [PATCH 119/132] gdrom: change to use __blk_end_request()

This patch converts gdrom to use __blk_end_request() directly
so that end_{queued|dequeued}_request() can be removed.

gd.transfer is '1' in error cases and '0' in non-error cases,
so gdrom hasn't been propagating any error code to the block layer.
We can just convert error cases to '-EIO'.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/cdrom/gdrom.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 1231d95aa695..d6ba77a2dd7b 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -624,14 +624,14 @@ static void gdrom_readdisk_dma(struct work_struct *work)
 		ctrl_outb(1, GDROM_DMA_STATUS_REG);
 		wait_event_interruptible_timeout(request_queue,
 			gd.transfer == 0, GDROM_DEFAULT_TIMEOUT);
-		err = gd.transfer;
+		err = gd.transfer ? -EIO : 0;
 		gd.transfer = 0;
 		gd.pending = 0;
 		/* now seek to take the request spinlock
 		* before handling ending the request */
 		spin_lock(&gdrom_lock);
 		list_del_init(&req->queuelist);
-		end_dequeued_request(req, 1 - err);
+		__blk_end_request(req, err, blk_rq_bytes(req));
 	}
 	spin_unlock(&gdrom_lock);
 	kfree(read_command);

From 99cd3386f290eaf61f2b7596d5a4cc2007771174 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 1 Oct 2008 10:13:44 -0400
Subject: [PATCH 120/132] block: change elevator to use __blk_end_request()

This patch converts elevator to use __blk_end_request() directly
so that end_{queued|dequeued}_request() can be removed.
The related 'uptodate' arguments are converted to 'error'.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/elevator.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 8a74eedc3530..04518921db31 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -754,7 +754,7 @@ struct request *elv_next_request(struct request_queue *q)
 		 * not ever see it.
 		 */
 		if (blk_empty_barrier(rq)) {
-			end_queued_request(rq, 1);
+			__blk_end_request(rq, 0, blk_rq_bytes(rq));
 			continue;
 		}
 		if (!(rq->cmd_flags & REQ_STARTED)) {
@@ -825,7 +825,7 @@ struct request *elv_next_request(struct request_queue *q)
 			break;
 		} else if (ret == BLKPREP_KILL) {
 			rq->cmd_flags |= REQ_QUIET;
-			end_queued_request(rq, 0);
+			__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
 			break;
@@ -922,7 +922,7 @@ void elv_abort_queue(struct request_queue *q)
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
 		blk_add_trace_rq(q, rq, BLK_TA_ABORT);
-		end_queued_request(rq, 0);
+		__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
 	}
 }
 EXPORT_SYMBOL(elv_abort_queue);

From d00e29fd99dd63d1c51917604e35dee824ed567f Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 1 Oct 2008 10:14:46 -0400
Subject: [PATCH 121/132] block: remove end_{queued|dequeued}_request()

This patch removes end_queued_request() and end_dequeued_request(),
which are no longer used.

As a result, the only remaining user of __end_request() is end_request().
So the actual code in __end_request() is moved to end_request()
and __end_request() is removed.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 58 +++++-------------------------------------
 include/linux/blkdev.h |  2 --
 2 files changed, 7 insertions(+), 53 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b2d0ac8b760e..2d053b584410 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1790,17 +1790,6 @@ static void end_that_request_last(struct request *req, int error)
 	}
 }
 
-static inline void __end_request(struct request *rq, int uptodate,
-				 unsigned int nr_bytes)
-{
-	int error = 0;
-
-	if (uptodate <= 0)
-		error = uptodate ? uptodate : -EIO;
-
-	__blk_end_request(rq, error, nr_bytes);
-}
-
 /**
  * blk_rq_bytes - Returns bytes left to complete in the entire request
  * @rq: the request being processed
@@ -1830,41 +1819,6 @@ unsigned int blk_rq_cur_bytes(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
 
-/**
- * end_queued_request - end all I/O on a queued request
- * @rq:		the request being processed
- * @uptodate:	error value or %0/%1 uptodate flag
- *
- * Description:
- *     Ends all I/O on a request, and removes it from the block layer queues.
- *     Not suitable for normal I/O completion, unless the driver still has
- *     the request attached to the block layer.
- *
- **/
-void end_queued_request(struct request *rq, int uptodate)
-{
-	__end_request(rq, uptodate, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(end_queued_request);
-
-/**
- * end_dequeued_request - end all I/O on a dequeued request
- * @rq:		the request being processed
- * @uptodate:	error value or %0/%1 uptodate flag
- *
- * Description:
- *     Ends all I/O on a request. The request must already have been
- *     dequeued using blkdev_dequeue_request(), as is normally the case
- *     for most drivers.
- *
- **/
-void end_dequeued_request(struct request *rq, int uptodate)
-{
-	__end_request(rq, uptodate, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(end_dequeued_request);
-
-
 /**
  * end_request - end I/O on the current segment of the request
  * @req:	the request being processed
@@ -1879,14 +1833,16 @@ EXPORT_SYMBOL(end_dequeued_request);
  *     they have a residual value to account for. For that case this function
  *     isn't really useful, unless the residual just happens to be the
  *     full current segment. In other words, don't use this function in new
- *     code. Use blk_end_request() or __blk_end_request() to end partial parts
- *     of a request, or end_dequeued_request() and end_queued_request() to
- *     completely end IO on a dequeued/queued request.
- *
+ *     code. Use blk_end_request() or __blk_end_request() to end a request.
  **/
 void end_request(struct request *req, int uptodate)
 {
-	__end_request(req, uptodate, req->hard_cur_sectors << 9);
+	int error = 0;
+
+	if (uptodate <= 0)
+		error = uptodate ? uptodate : -EIO;
+
+	__blk_end_request(req, error, req->hard_cur_sectors << 9);
 }
 EXPORT_SYMBOL(end_request);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9e0ee1a8254e..bfc18e497c7f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -793,8 +793,6 @@ extern int __blk_end_request(struct request *rq, int error,
 extern int blk_end_bidi_request(struct request *rq, int error,
 				unsigned int nr_bytes, unsigned int bidi_bytes);
 extern void end_request(struct request *, int);
-extern void end_queued_request(struct request *, int);
-extern void end_dequeued_request(struct request *, int);
 extern int blk_end_request_callback(struct request *rq, int error,
 				unsigned int nr_bytes,
 				int (drv_callback)(struct request *));

From 8deaf7210728c453295dc1cb2a5b66c68183ac85 Mon Sep 17 00:00:00 2001
From: Alberto Bertogli <albertito@blitiri.com.ar>
Date: Thu, 2 Oct 2008 12:46:53 +0200
Subject: [PATCH 122/132] bio.h: Remove unused conditional code

The whole bio_integrity() definition is inside an #ifdef
CONFIG_BLK_DEV_INTEGRITY, so there's no need for the conditional code.

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6520ee1a3f6d..98c2d0570657 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -457,14 +457,7 @@ static inline int bio_has_data(struct bio *bio)
 #define bip_for_each_vec(bvl, bip, i)					\
 	__bip_for_each_vec(bvl, bip, i, (bip)->bip_idx)
 
-static inline int bio_integrity(struct bio *bio)
-{
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-	return bio->bi_integrity != NULL;
-#else
-	return 0;
-#endif
-}
+#define bio_integrity(bio) (bio->bi_integrity != NULL)
 
 extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *);
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);

From b04accc425d52ca59699290661e0dfd09b0feeeb Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 2 Oct 2008 12:53:22 +0200
Subject: [PATCH 123/132] block: revert part of
 d7533ad0e132f92e75c1b2eb7c26387b25a583c1

We need bdev_get_integrity() to support the pending md/dm patches.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c     | 5 -----
 include/linux/blkdev.h | 7 +++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index ba4ada08564a..6e28dcdd23a2 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -150,11 +150,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
-static struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
-{
-	return bdev->bd_disk->integrity;
-}
-
 static int bdev_integrity_enabled(struct block_device *bdev, int rw)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bdev);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bfc18e497c7f..bc693f5c3886 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1016,6 +1016,12 @@ extern int blk_integrity_compare(struct block_device *, struct block_device *);
 extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 extern int blk_rq_count_integrity_sg(struct request *);
 
+static inline
+struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
+{
+	return bdev->bd_disk->integrity;
+}
+
 static inline int blk_integrity_rq(struct request *rq)
 {
 	if (rq->bio == NULL)
@@ -1029,6 +1035,7 @@ static inline int blk_integrity_rq(struct request *rq)
 #define blk_integrity_rq(rq)			(0)
 #define blk_rq_count_integrity_sg(a)		(0)
 #define blk_rq_map_integrity_sg(a, b)		(0)
+#define bdev_get_integrity(a)			(0)
 #define blk_integrity_compare(a, b)		(0)
 #define blk_integrity_register(a, b)		(0)
 #define blk_integrity_unregister(a)		do { } while (0);

From 74aa8c2cc010035a7eef2b4ca4d6430e0dae206a Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 1 Oct 2008 03:38:37 -0400
Subject: [PATCH 124/132] block: Introduce integrity data ownership flag

A filesystem might supply its own integrity metadata.  Introduce a
flag that indicates whether the filesystem or the block layer owns the
integrity buffer.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c  | 3 ++-
 include/linux/bio.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 6e28dcdd23a2..19caf7c962ac 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	BUG_ON(bip == NULL);
 
 	/* A cloned bio doesn't own the integrity metadata */
-	if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+	if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
+	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
 	mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 98c2d0570657..d86d39d490e6 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -109,6 +109,7 @@ struct bio {
 #define BIO_EOPNOTSUPP	7	/* not supported */
 #define BIO_CPU_AFFINE	8	/* complete bio on same CPU as submitted */
 #define BIO_NULL_MAPPED 9	/* contains invalid user pages */
+#define BIO_FS_INTEGRITY 10	/* fs owns integrity data, not block layer */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*

From 0c032ab889e7b20b8a5a7d09313e4aca214a15f7 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 1 Oct 2008 03:38:38 -0400
Subject: [PATCH 125/132] block: Fix double put in blk_integrity_unregister

 - kobject_del already puts the parent.

 - Set integrity profile to NULL to prevent stale data.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-integrity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 69023da63151..e3817a016a12 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -376,7 +376,7 @@ void blk_integrity_unregister(struct gendisk *disk)
 
 	kobject_uevent(&bi->kobj, KOBJ_REMOVE);
 	kobject_del(&bi->kobj);
-	kobject_put(&disk_to_dev(disk)->kobj);
 	kmem_cache_free(integrity_cachep, bi);
+	disk->integrity = NULL;
 }
 EXPORT_SYMBOL(blk_integrity_unregister);

From ad7fce93147d32ae53d25d9ea1a8ba31a239deee Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 1 Oct 2008 03:38:39 -0400
Subject: [PATCH 126/132] block: Switch blk_integrity_compare from bdev to
 gendisk

The DM and MD integrity support now depends on being able to use
gendisks instead of block_devices when comparing integrity profiles.
Change function parameters accordingly.

Also update comparison logic so that two NULL profiles are a valid
configuration.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-integrity.c  | 28 ++++++++++++++--------------
 include/linux/blkdev.h |  2 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index e3817a016a12..61a8e2f8fdd0 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -108,51 +108,51 @@ new_segment:
 EXPORT_SYMBOL(blk_rq_map_integrity_sg);
 
 /**
- * blk_integrity_compare - Compare integrity profile of two block devices
- * @bd1:	Device to compare
- * @bd2:	Device to compare
+ * blk_integrity_compare - Compare integrity profile of two disks
+ * @gd1:	Disk to compare
+ * @gd2:	Disk to compare
  *
  * Description: Meta-devices like DM and MD need to verify that all
  * sub-devices use the same integrity format before advertising to
  * upper layers that they can send/receive integrity metadata.  This
- * function can be used to check whether two block devices have
+ * function can be used to check whether two gendisk devices have
  * compatible integrity formats.
  */
-int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2)
+int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
 {
-	struct blk_integrity *b1 = bd1->bd_disk->integrity;
-	struct blk_integrity *b2 = bd2->bd_disk->integrity;
+	struct blk_integrity *b1 = gd1->integrity;
+	struct blk_integrity *b2 = gd2->integrity;
 
-	BUG_ON(bd1->bd_disk == NULL);
-	BUG_ON(bd2->bd_disk == NULL);
+	if (!b1 && !b2)
+		return 0;
 
 	if (!b1 || !b2)
-		return 0;
+		return -1;
 
 	if (b1->sector_size != b2->sector_size) {
 		printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__,
-		       bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+		       gd1->disk_name, gd2->disk_name,
 		       b1->sector_size, b2->sector_size);
 		return -1;
 	}
 
 	if (b1->tuple_size != b2->tuple_size) {
 		printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__,
-		       bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+		       gd1->disk_name, gd2->disk_name,
 		       b1->tuple_size, b2->tuple_size);
 		return -1;
 	}
 
 	if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
 		printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__,
-		       bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+		       gd1->disk_name, gd2->disk_name,
 		       b1->tag_size, b2->tag_size);
 		return -1;
 	}
 
 	if (strcmp(b1->name, b2->name)) {
 		printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__,
-		       bd1->bd_disk->disk_name, bd2->bd_disk->disk_name,
+		       gd1->disk_name, gd2->disk_name,
 		       b1->name, b2->name);
 		return -1;
 	}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc693f5c3886..00d340b0f758 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1012,7 +1012,7 @@ struct blk_integrity {
 
 extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
 extern void blk_integrity_unregister(struct gendisk *);
-extern int blk_integrity_compare(struct block_device *, struct block_device *);
+extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
 extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 extern int blk_rq_count_integrity_sg(struct request *);
 

From b02739b01c5309d74a59859f2ce92c931d1f1955 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 2 Oct 2008 18:47:49 +0200
Subject: [PATCH 127/132] block: gendisk integrity wrapper

This is a wrapper for accessing a gendisk's integrity bits.  It allows
the integrity support in MD to be compiled with BLK_DEV_INTEGRITY off.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blkdev.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 00d340b0f758..a92d9e4ea96e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1022,6 +1022,11 @@ struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
 	return bdev->bd_disk->integrity;
 }
 
+static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+{
+	return disk->integrity;
+}
+
 static inline int blk_integrity_rq(struct request *rq)
 {
 	if (rq->bio == NULL)
@@ -1036,6 +1041,7 @@ static inline int blk_integrity_rq(struct request *rq)
 #define blk_rq_count_integrity_sg(a)		(0)
 #define blk_rq_map_integrity_sg(a, b)		(0)
 #define bdev_get_integrity(a)			(0)
+#define blk_get_integrity(a)			(0)
 #define blk_integrity_compare(a, b)		(0)
 #define blk_integrity_register(a, b)		(0)
 #define blk_integrity_unregister(a)		do { } while (0);

From ad3316bf4eeb53c89164f759767f911072b56203 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 1 Oct 2008 22:42:53 -0400
Subject: [PATCH 128/132] block: Find bio sector offset given idx and offset

Helper function to find the sector offset in a bio given bvec index
and page offset.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c            | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/bio.h |  1 +
 2 files changed, 37 insertions(+)

diff --git a/fs/bio.c b/fs/bio.c
index e56e7685af9c..a5af5809f566 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1300,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	return bp;
 }
 
+/**
+ *      bio_sector_offset - Find hardware sector offset in bio
+ *      @bio:           bio to inspect
+ *      @index:         bio_vec index
+ *      @offset:        offset in bv_page
+ *
+ *      Return the number of hardware sectors between beginning of bio
+ *      and an end point indicated by a bio_vec index and an offset
+ *      within that vector's page.
+ */
+sector_t bio_sector_offset(struct bio *bio, unsigned short index,
+			   unsigned int offset)
+{
+	unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+	struct bio_vec *bv;
+	sector_t sectors;
+	int i;
+
+	sectors = 0;
+
+	if (index >= bio->bi_idx)
+		index = bio->bi_vcnt - 1;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		if (i == index) {
+			if (offset > bv->bv_offset)
+				sectors += (offset - bv->bv_offset) / sector_sz;
+			break;
+		}
+
+		sectors += bv->bv_len / sector_sz;
+	}
+
+	return sectors;
+}
+EXPORT_SYMBOL(bio_sector_offset);
 
 /*
  * create memory pools for biovec's in a bio_set.
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d86d39d490e6..fe12d0f9ebaa 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -327,6 +327,7 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
 			   unsigned int, unsigned int);
 extern int bio_get_nr_vecs(struct block_device *);
+extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int);
 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
 				unsigned long, unsigned int, int, gfp_t);
 struct sg_iovec;

From 6feef531f55cf4a20fd9eb39f5352e5745203603 Mon Sep 17 00:00:00 2001
From: Denis ChengRq <crquan@gmail.com>
Date: Thu, 9 Oct 2008 08:57:05 +0200
Subject: [PATCH 129/132] block: mark bio_split_pool static

Since all bio_split callers pass the same single bio_split_pool, the bio_split
function can use bio_split_pool directly instead of taking a mempool_t
parameter.

The mempool_t parameter can then be removed from the bio_split parameter list,
and since bio_split_pool is then only referenced in fs/bio.c, it can be marked
static.

Signed-off-by: Denis ChengRq <crquan@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/pktcdvd.c | 2 +-
 drivers/md/linear.c     | 2 +-
 drivers/md/raid0.c      | 2 +-
 drivers/md/raid10.c     | 2 +-
 fs/bio.c                | 9 ++++-----
 include/linux/bio.h     | 4 +---
 6 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index e1a90bbb4747..0e077150568b 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2544,7 +2544,7 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio)
 		if (last_zone != zone) {
 			BUG_ON(last_zone != zone + pd->settings.size);
 			first_sectors = last_zone - bio->bi_sector;
-			bp = bio_split(bio, bio_split_pool, first_sectors);
+			bp = bio_split(bio, first_sectors);
 			BUG_ON(!bp);
 			pkt_make_request(q, &bp->bio1);
 			pkt_make_request(q, &bp->bio2);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index c80ea90593d3..b9cbee688fae 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -353,7 +353,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 		 * split it.
 		 */
 		struct bio_pair *bp;
-		bp = bio_split(bio, bio_split_pool,
+		bp = bio_split(bio,
 			       ((tmp_dev->offset + tmp_dev->size)<<1) - bio->bi_sector);
 		if (linear_make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f52f442a735f..53508a8a981d 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -427,7 +427,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		/* This is a one page bio that upper layers
 		 * refuse to split for us, so we need to split it.
 		 */
-		bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
+		bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)));
 		if (raid0_make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
 		if (raid0_make_request(q, &bp->bio2))
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5f990133f5ef..8bdc9bfc2887 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -817,7 +817,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 		/* This is a one page bio that upper layers
 		 * refuse to split for us, so we need to split it.
 		 */
-		bp = bio_split(bio, bio_split_pool,
+		bp = bio_split(bio,
 			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
 		if (make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
diff --git a/fs/bio.c b/fs/bio.c
index a5af5809f566..77a55bcceedb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -30,7 +30,7 @@
 
 static struct kmem_cache *bio_slab __read_mostly;
 
-mempool_t *bio_split_pool __read_mostly;
+static mempool_t *bio_split_pool __read_mostly;
 
 /*
  * if you change this list, also change bvec_alloc or things will
@@ -1256,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
  * split a bio - only worry about a bio with a single page
  * in it's iovec
  */
-struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
+struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 {
-	struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);
+	struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
 
 	if (!bp)
 		return bp;
@@ -1292,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	bp->bio2.bi_end_io = bio_pair_end_2;
 
 	bp->bio1.bi_private = bi;
-	bp->bio2.bi_private = pool;
+	bp->bio2.bi_private = bio_split_pool;
 
 	if (bio_integrity(bi))
 		bio_integrity_split(bi, bp, first_sectors);
@@ -1455,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
 EXPORT_SYMBOL(bio_copy_kern);
 EXPORT_SYMBOL(bio_pair_release);
 EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_split_pool);
 EXPORT_SYMBOL(bio_copy_user);
 EXPORT_SYMBOL(bio_uncopy_user);
 EXPORT_SYMBOL(bioset_create);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index fe12d0f9ebaa..fb97221d7c30 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -300,9 +300,7 @@ struct bio_pair {
 	atomic_t			cnt;
 	int				error;
 };
-extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool,
-				  int first_sectors);
-extern mempool_t *bio_split_pool;
+extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
 extern void bio_pair_release(struct bio_pair *dbio);
 
 extern struct bio_set *bioset_create(int, int);

From af5639424008ffe96f89b059bea1aec15e0115a9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 9 Oct 2008 09:01:10 +0200
Subject: [PATCH 130/132] block: add some comments around the bio read-write
 flags

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index fb97221d7c30..ff5b4cf9e2da 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -123,13 +123,23 @@ struct bio {
 /*
  * bio bi_rw flags
  *
- * bit 0 -- read (not set) or write (set)
+ * bit 0 -- data direction
+ *	If not set, bio is a read from device. If set, it's a write to device.
  * bit 1 -- rw-ahead when set
  * bit 2 -- barrier
+ *	Insert a serialization point in the IO queue, forcing previously
+ *	submitted IO to be completed before this one is issued.
  * bit 3 -- fail fast, don't want low level driver retries
  * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
+ *	Note that this does NOT indicate that the IO itself is sync, just
+ *	that the block layer will not postpone issue of this IO by plugging.
  * bit 5 -- metadata request
+ *	Used for tracing to differentiate metadata and data IO. May also
+ *	get some preferential treatment in the IO scheduler
  * bit 6 -- discard sectors
+ *	Informs the lower level device that this range of sectors is no longer
+ *	used by the file system and may thus be freed by the device. Used
+ *	for flash based storage.
  */
 #define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
 #define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */

From 57d1b5366f46fe434e565b710baf683daff78dd8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 9 Oct 2008 10:42:38 +0200
Subject: [PATCH 131/132] block_dev: fix kernel-doc in new functions

Fix kernel-doc in new functions:

Error(mmotm-2008-1002-1617//fs/block_dev.c:895): duplicate section name 'Description'
Error(mmotm-2008-1002-1617//fs/block_dev.c:924): duplicate section name 'Description'
Warning(mmotm-2008-1002-1617//fs/block_dev.c:1282): No description found for parameter 'pathname'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
cc: Andrew Patterson <andrew.patterson@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 57e2786dd2a5..d84f0469a016 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -879,9 +879,7 @@ static void flush_disk(struct block_device *bdev)
 }
 
 /**
- * check_disk_size_change - checks for disk size change and adjusts
- *                          bdev size.
- *
+ * check_disk_size_change - checks for disk size change and adjusts bdev size.
  * @disk: struct gendisk to check
  * @bdev: struct bdev to adjust.
  *
@@ -908,9 +906,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
 EXPORT_SYMBOL(check_disk_size_change);
 
 /**
- * revalidate_disk - wrapper for lower-level driver's revalidate_disk
- *                   call-back
- *
+ * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
  * @disk: struct gendisk to be revalidated
  *
  * This routine is a wrapper for lower-level driver's revalidate_disk
@@ -1266,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 
 /**
  * lookup_bdev  - lookup a struct block_device by name
+ * @pathname:	special file representing the block device
  *
- * @path:	special file representing the block device
- *
- * Get a reference to the blockdevice at @path in the current
+ * Get a reference to the blockdevice at @pathname in the current
  * namespace if possible and return it.  Return ERR_PTR(error)
  * otherwise.
  */

From b911e473d24633c19414b54b82b9ff0b1a2419d7 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 10 Oct 2008 08:22:44 +0200
Subject: [PATCH 132/132] doc/cdrom: Trivial documentation error, file not
 present

The sbpcd tester program is not included in the kernel source tree,
so remove the reference to it.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Reported-by: Nick Warne <nick@ukfsn.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/cdrom/ide-cd | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Documentation/cdrom/ide-cd b/Documentation/cdrom/ide-cd
index 91c0dcc6fa5c..2c558cd6c1ef 100644
--- a/Documentation/cdrom/ide-cd
+++ b/Documentation/cdrom/ide-cd
@@ -145,8 +145,7 @@ useful for reading photocds.
 
 To play an audio CD, you should first unmount and remove any data
 CDROM.  Any of the CDROM player programs should then work (workman,
-workbone, cdplayer, etc.).  Lacking anything else, you could use the
-cdtester program in Documentation/cdrom/sbpcd.
+workbone, cdplayer, etc.).
 
 On a few drives, you can read digital audio directly using a program
 such as cdda2wav.  The only types of drive which I've heard support