Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm

* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (44 commits)
  dm raid1: report fault status
  dm raid1: handle read failures
  dm raid1: fix EIO after log failure
  dm raid1: handle recovery failures
  dm raid1: handle write failures
  dm snapshot: combine consecutive exceptions in memory
  dm: stripe enhanced status return
  dm: stripe trigger event on failure
  dm log: auto load modules
  dm: move deferred bio flushing to workqueue
  dm crypt: use async crypto
  dm crypt: prepare async callback fn
  dm crypt: add completion for async
  dm crypt: add async request mempool
  dm crypt: extract scatterlist processing
  dm crypt: tidy io ref counting
  dm crypt: introduce crypt_write_io_loop
  dm crypt: abstract crypt_write_done
  dm crypt: store sector mapping in dm_crypt_io
  dm crypt: move queue functions
  ...
This commit is contained in:
Linus Torvalds 2008-02-07 19:30:50 -08:00
commit a4ffc0a0b2
15 changed files with 1378 additions and 489 deletions

View File

@ -204,7 +204,7 @@ config BLK_DEV_DM
config DM_DEBUG config DM_DEBUG
boolean "Device mapper debugging support" boolean "Device mapper debugging support"
depends on BLK_DEV_DM && EXPERIMENTAL depends on BLK_DEV_DM
---help--- ---help---
Enable this for messages that may help debug device-mapper problems. Enable this for messages that may help debug device-mapper problems.
@ -212,7 +212,7 @@ config DM_DEBUG
config DM_CRYPT config DM_CRYPT
tristate "Crypt target support" tristate "Crypt target support"
depends on BLK_DEV_DM && EXPERIMENTAL depends on BLK_DEV_DM
select CRYPTO select CRYPTO
select CRYPTO_CBC select CRYPTO_CBC
---help--- ---help---
@ -230,34 +230,34 @@ config DM_CRYPT
If unsure, say N. If unsure, say N.
config DM_SNAPSHOT config DM_SNAPSHOT
tristate "Snapshot target (EXPERIMENTAL)" tristate "Snapshot target"
depends on BLK_DEV_DM && EXPERIMENTAL depends on BLK_DEV_DM
---help--- ---help---
Allow volume managers to take writable snapshots of a device. Allow volume managers to take writable snapshots of a device.
config DM_MIRROR config DM_MIRROR
tristate "Mirror target (EXPERIMENTAL)" tristate "Mirror target"
depends on BLK_DEV_DM && EXPERIMENTAL depends on BLK_DEV_DM
---help--- ---help---
Allow volume managers to mirror logical volumes, also Allow volume managers to mirror logical volumes, also
needed for live data migration tools such as 'pvmove'. needed for live data migration tools such as 'pvmove'.
config DM_ZERO config DM_ZERO
tristate "Zero target (EXPERIMENTAL)" tristate "Zero target"
depends on BLK_DEV_DM && EXPERIMENTAL depends on BLK_DEV_DM
---help--- ---help---
A target that discards writes, and returns all zeroes for A target that discards writes, and returns all zeroes for
reads. Useful in some recovery situations. reads. Useful in some recovery situations.
config DM_MULTIPATH config DM_MULTIPATH
tristate "Multipath target (EXPERIMENTAL)" tristate "Multipath target"
depends on BLK_DEV_DM && EXPERIMENTAL depends on BLK_DEV_DM
---help--- ---help---
Allow volume managers to support multipath hardware. Allow volume managers to support multipath hardware.
config DM_MULTIPATH_EMC config DM_MULTIPATH_EMC
tristate "EMC CX/AX multipath support (EXPERIMENTAL)" tristate "EMC CX/AX multipath support"
depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL depends on DM_MULTIPATH && BLK_DEV_DM
---help--- ---help---
Multipath support for EMC CX/AX series hardware. Multipath support for EMC CX/AX series hardware.

View File

@ -1,11 +1,12 @@
/* /*
* Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
* Copyright (C) 2006 Red Hat, Inc. All rights reserved. * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved.
* *
* This file is released under the GPL. * This file is released under the GPL.
*/ */
#include <linux/completion.h>
#include <linux/err.h> #include <linux/err.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/init.h> #include <linux/init.h>
@ -27,21 +28,11 @@
#define DM_MSG_PREFIX "crypt" #define DM_MSG_PREFIX "crypt"
#define MESG_STR(x) x, sizeof(x) #define MESG_STR(x) x, sizeof(x)
/*
* per bio private data
*/
struct dm_crypt_io {
struct dm_target *target;
struct bio *base_bio;
struct work_struct work;
atomic_t pending;
int error;
};
/* /*
* context holding the current state of a multi-part conversion * context holding the current state of a multi-part conversion
*/ */
struct convert_context { struct convert_context {
struct completion restart;
struct bio *bio_in; struct bio *bio_in;
struct bio *bio_out; struct bio *bio_out;
unsigned int offset_in; unsigned int offset_in;
@ -49,7 +40,27 @@ struct convert_context {
unsigned int idx_in; unsigned int idx_in;
unsigned int idx_out; unsigned int idx_out;
sector_t sector; sector_t sector;
int write; atomic_t pending;
};
/*
* per bio private data
*/
struct dm_crypt_io {
struct dm_target *target;
struct bio *base_bio;
struct work_struct work;
struct convert_context ctx;
atomic_t pending;
int error;
sector_t sector;
};
struct dm_crypt_request {
struct scatterlist sg_in;
struct scatterlist sg_out;
}; };
struct crypt_config; struct crypt_config;
@ -72,10 +83,11 @@ struct crypt_config {
sector_t start; sector_t start;
/* /*
* pool for per bio private data and * pool for per bio private data, crypto requests and
* for encryption buffer pages * encryption requests/buffer pages
*/ */
mempool_t *io_pool; mempool_t *io_pool;
mempool_t *req_pool;
mempool_t *page_pool; mempool_t *page_pool;
struct bio_set *bs; struct bio_set *bs;
@ -93,9 +105,25 @@ struct crypt_config {
sector_t iv_offset; sector_t iv_offset;
unsigned int iv_size; unsigned int iv_size;
/*
* Layout of each crypto request:
*
* struct ablkcipher_request
* context
* padding
* struct dm_crypt_request
* padding
* IV
*
* The padding is added so that dm_crypt_request and the IV are
* correctly aligned.
*/
unsigned int dmreq_start;
struct ablkcipher_request *req;
char cipher[CRYPTO_MAX_ALG_NAME]; char cipher[CRYPTO_MAX_ALG_NAME];
char chainmode[CRYPTO_MAX_ALG_NAME]; char chainmode[CRYPTO_MAX_ALG_NAME];
struct crypto_blkcipher *tfm; struct crypto_ablkcipher *tfm;
unsigned long flags; unsigned long flags;
unsigned int key_size; unsigned int key_size;
u8 key[0]; u8 key[0];
@ -108,6 +136,7 @@ struct crypt_config {
static struct kmem_cache *_crypt_io_pool; static struct kmem_cache *_crypt_io_pool;
static void clone_init(struct dm_crypt_io *, struct bio *); static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
/* /*
* Different IV generation algorithms: * Different IV generation algorithms:
@ -188,7 +217,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
return PTR_ERR(essiv_tfm); return PTR_ERR(essiv_tfm);
} }
if (crypto_cipher_blocksize(essiv_tfm) != if (crypto_cipher_blocksize(essiv_tfm) !=
crypto_blkcipher_ivsize(cc->tfm)) { crypto_ablkcipher_ivsize(cc->tfm)) {
ti->error = "Block size of ESSIV cipher does " ti->error = "Block size of ESSIV cipher does "
"not match IV size of block cipher"; "not match IV size of block cipher";
crypto_free_cipher(essiv_tfm); crypto_free_cipher(essiv_tfm);
@ -225,7 +254,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts) const char *opts)
{ {
unsigned int bs = crypto_blkcipher_blocksize(cc->tfm); unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
int log = ilog2(bs); int log = ilog2(bs);
/* we need to calculate how far we must shift the sector count /* we need to calculate how far we must shift the sector count
@ -289,42 +318,10 @@ static struct crypt_iv_operations crypt_iv_null_ops = {
.generator = crypt_iv_null_gen .generator = crypt_iv_null_gen
}; };
static int
crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
struct scatterlist *in, unsigned int length,
int write, sector_t sector)
{
u8 iv[cc->iv_size] __attribute__ ((aligned(__alignof__(u64))));
struct blkcipher_desc desc = {
.tfm = cc->tfm,
.info = iv,
.flags = CRYPTO_TFM_REQ_MAY_SLEEP,
};
int r;
if (cc->iv_gen_ops) {
r = cc->iv_gen_ops->generator(cc, iv, sector);
if (r < 0)
return r;
if (write)
r = crypto_blkcipher_encrypt_iv(&desc, out, in, length);
else
r = crypto_blkcipher_decrypt_iv(&desc, out, in, length);
} else {
if (write)
r = crypto_blkcipher_encrypt(&desc, out, in, length);
else
r = crypto_blkcipher_decrypt(&desc, out, in, length);
}
return r;
}
static void crypt_convert_init(struct crypt_config *cc, static void crypt_convert_init(struct crypt_config *cc,
struct convert_context *ctx, struct convert_context *ctx,
struct bio *bio_out, struct bio *bio_in, struct bio *bio_out, struct bio *bio_in,
sector_t sector, int write) sector_t sector)
{ {
ctx->bio_in = bio_in; ctx->bio_in = bio_in;
ctx->bio_out = bio_out; ctx->bio_out = bio_out;
@ -333,7 +330,79 @@ static void crypt_convert_init(struct crypt_config *cc,
ctx->idx_in = bio_in ? bio_in->bi_idx : 0; ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
ctx->idx_out = bio_out ? bio_out->bi_idx : 0; ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
ctx->sector = sector + cc->iv_offset; ctx->sector = sector + cc->iv_offset;
ctx->write = write; init_completion(&ctx->restart);
/*
* Crypto operation can be asynchronous,
* ctx->pending is increased after request submission.
* We need to ensure that we don't call the crypt finish
* operation before pending got incremented
* (dependent on crypt submission return code).
*/
atomic_set(&ctx->pending, 2);
}
static int crypt_convert_block(struct crypt_config *cc,
struct convert_context *ctx,
struct ablkcipher_request *req)
{
struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
struct dm_crypt_request *dmreq;
u8 *iv;
int r = 0;
dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start);
iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
crypto_ablkcipher_alignmask(cc->tfm) + 1);
sg_init_table(&dmreq->sg_in, 1);
sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
bv_in->bv_offset + ctx->offset_in);
sg_init_table(&dmreq->sg_out, 1);
sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT,
bv_out->bv_offset + ctx->offset_out);
ctx->offset_in += 1 << SECTOR_SHIFT;
if (ctx->offset_in >= bv_in->bv_len) {
ctx->offset_in = 0;
ctx->idx_in++;
}
ctx->offset_out += 1 << SECTOR_SHIFT;
if (ctx->offset_out >= bv_out->bv_len) {
ctx->offset_out = 0;
ctx->idx_out++;
}
if (cc->iv_gen_ops) {
r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
if (r < 0)
return r;
}
ablkcipher_request_set_crypt(req, &dmreq->sg_in, &dmreq->sg_out,
1 << SECTOR_SHIFT, iv);
if (bio_data_dir(ctx->bio_in) == WRITE)
r = crypto_ablkcipher_encrypt(req);
else
r = crypto_ablkcipher_decrypt(req);
return r;
}
static void kcryptd_async_done(struct crypto_async_request *async_req,
int error);
static void crypt_alloc_req(struct crypt_config *cc,
struct convert_context *ctx)
{
if (!cc->req)
cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
ablkcipher_request_set_tfm(cc->req, cc->tfm);
ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
CRYPTO_TFM_REQ_MAY_SLEEP,
kcryptd_async_done, ctx);
} }
/* /*
@ -346,36 +415,38 @@ static int crypt_convert(struct crypt_config *cc,
while(ctx->idx_in < ctx->bio_in->bi_vcnt && while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
ctx->idx_out < ctx->bio_out->bi_vcnt) { ctx->idx_out < ctx->bio_out->bi_vcnt) {
struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
struct scatterlist sg_in, sg_out;
sg_init_table(&sg_in, 1); crypt_alloc_req(cc, ctx);
sg_set_page(&sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, bv_in->bv_offset + ctx->offset_in);
sg_init_table(&sg_out, 1); r = crypt_convert_block(cc, ctx, cc->req);
sg_set_page(&sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, bv_out->bv_offset + ctx->offset_out);
ctx->offset_in += sg_in.length; switch (r) {
if (ctx->offset_in >= bv_in->bv_len) { case -EBUSY:
ctx->offset_in = 0; wait_for_completion(&ctx->restart);
ctx->idx_in++; INIT_COMPLETION(ctx->restart);
/* fall through*/
case -EINPROGRESS:
atomic_inc(&ctx->pending);
cc->req = NULL;
r = 0;
/* fall through*/
case 0:
ctx->sector++;
continue;
} }
ctx->offset_out += sg_out.length; break;
if (ctx->offset_out >= bv_out->bv_len) {
ctx->offset_out = 0;
ctx->idx_out++;
}
r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length,
ctx->write, ctx->sector);
if (r < 0)
break;
ctx->sector++;
} }
/*
* If there are pending crypto operations, run async
* code. Otherwise process return code synchronously.
* The step of 2 ensures that async finish doesn't
* call crypto finish too early.
*/
if (atomic_sub_return(2, &ctx->pending))
return -EINPROGRESS;
return r; return r;
} }
@ -455,18 +526,14 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
* One of the bios was finished. Check for completion of * One of the bios was finished. Check for completion of
* the whole request and correctly clean up the buffer. * the whole request and correctly clean up the buffer.
*/ */
static void crypt_dec_pending(struct dm_crypt_io *io, int error) static void crypt_dec_pending(struct dm_crypt_io *io)
{ {
struct crypt_config *cc = (struct crypt_config *) io->target->private; struct crypt_config *cc = io->target->private;
if (error < 0)
io->error = error;
if (!atomic_dec_and_test(&io->pending)) if (!atomic_dec_and_test(&io->pending))
return; return;
bio_endio(io->base_bio, io->error); bio_endio(io->base_bio, io->error);
mempool_free(io, cc->io_pool); mempool_free(io, cc->io_pool);
} }
@ -484,30 +551,11 @@ static void crypt_dec_pending(struct dm_crypt_io *io, int error)
* starved by new requests which can block in the first stages due * starved by new requests which can block in the first stages due
* to memory allocation. * to memory allocation.
*/ */
static void kcryptd_do_work(struct work_struct *work);
static void kcryptd_do_crypt(struct work_struct *work);
static void kcryptd_queue_io(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->target->private;
INIT_WORK(&io->work, kcryptd_do_work);
queue_work(cc->io_queue, &io->work);
}
static void kcryptd_queue_crypt(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->target->private;
INIT_WORK(&io->work, kcryptd_do_crypt);
queue_work(cc->crypt_queue, &io->work);
}
static void crypt_endio(struct bio *clone, int error) static void crypt_endio(struct bio *clone, int error)
{ {
struct dm_crypt_io *io = clone->bi_private; struct dm_crypt_io *io = clone->bi_private;
struct crypt_config *cc = io->target->private; struct crypt_config *cc = io->target->private;
unsigned read_io = bio_data_dir(clone) == READ; unsigned rw = bio_data_dir(clone);
if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error))
error = -EIO; error = -EIO;
@ -515,21 +563,20 @@ static void crypt_endio(struct bio *clone, int error)
/* /*
* free the processed pages * free the processed pages
*/ */
if (!read_io) { if (rw == WRITE)
crypt_free_buffer_pages(cc, clone); crypt_free_buffer_pages(cc, clone);
goto out;
bio_put(clone);
if (rw == READ && !error) {
kcryptd_queue_crypt(io);
return;
} }
if (unlikely(error)) if (unlikely(error))
goto out; io->error = error;
bio_put(clone); crypt_dec_pending(io);
kcryptd_queue_crypt(io);
return;
out:
bio_put(clone);
crypt_dec_pending(io, error);
} }
static void clone_init(struct dm_crypt_io *io, struct bio *clone) static void clone_init(struct dm_crypt_io *io, struct bio *clone)
@ -543,12 +590,11 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
clone->bi_destructor = dm_crypt_bio_destructor; clone->bi_destructor = dm_crypt_bio_destructor;
} }
static void process_read(struct dm_crypt_io *io) static void kcryptd_io_read(struct dm_crypt_io *io)
{ {
struct crypt_config *cc = io->target->private; struct crypt_config *cc = io->target->private;
struct bio *base_bio = io->base_bio; struct bio *base_bio = io->base_bio;
struct bio *clone; struct bio *clone;
sector_t sector = base_bio->bi_sector - io->target->begin;
atomic_inc(&io->pending); atomic_inc(&io->pending);
@ -559,7 +605,8 @@ static void process_read(struct dm_crypt_io *io)
*/ */
clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
if (unlikely(!clone)) { if (unlikely(!clone)) {
crypt_dec_pending(io, -ENOMEM); io->error = -ENOMEM;
crypt_dec_pending(io);
return; return;
} }
@ -567,25 +614,71 @@ static void process_read(struct dm_crypt_io *io)
clone->bi_idx = 0; clone->bi_idx = 0;
clone->bi_vcnt = bio_segments(base_bio); clone->bi_vcnt = bio_segments(base_bio);
clone->bi_size = base_bio->bi_size; clone->bi_size = base_bio->bi_size;
clone->bi_sector = cc->start + sector; clone->bi_sector = cc->start + io->sector;
memcpy(clone->bi_io_vec, bio_iovec(base_bio), memcpy(clone->bi_io_vec, bio_iovec(base_bio),
sizeof(struct bio_vec) * clone->bi_vcnt); sizeof(struct bio_vec) * clone->bi_vcnt);
generic_make_request(clone); generic_make_request(clone);
} }
static void process_write(struct dm_crypt_io *io) static void kcryptd_io_write(struct dm_crypt_io *io)
{
struct bio *clone = io->ctx.bio_out;
generic_make_request(clone);
}
static void kcryptd_io(struct work_struct *work)
{
struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
if (bio_data_dir(io->base_bio) == READ)
kcryptd_io_read(io);
else
kcryptd_io_write(io);
}
static void kcryptd_queue_io(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->target->private;
INIT_WORK(&io->work, kcryptd_io);
queue_work(cc->io_queue, &io->work);
}
static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
int error, int async)
{
struct bio *clone = io->ctx.bio_out;
struct crypt_config *cc = io->target->private;
if (unlikely(error < 0)) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
io->error = -EIO;
return;
}
/* crypt_convert should have filled the clone bio */
BUG_ON(io->ctx.idx_out < clone->bi_vcnt);
clone->bi_sector = cc->start + io->sector;
io->sector += bio_sectors(clone);
if (async)
kcryptd_queue_io(io);
else {
atomic_inc(&io->pending);
generic_make_request(clone);
}
}
static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io)
{ {
struct crypt_config *cc = io->target->private; struct crypt_config *cc = io->target->private;
struct bio *base_bio = io->base_bio;
struct bio *clone; struct bio *clone;
struct convert_context ctx; unsigned remaining = io->base_bio->bi_size;
unsigned remaining = base_bio->bi_size; int r;
sector_t sector = base_bio->bi_sector - io->target->begin;
atomic_inc(&io->pending);
crypt_convert_init(cc, &ctx, NULL, base_bio, sector, 1);
/* /*
* The allocated buffers can be smaller than the whole bio, * The allocated buffers can be smaller than the whole bio,
@ -594,70 +687,110 @@ static void process_write(struct dm_crypt_io *io)
while (remaining) { while (remaining) {
clone = crypt_alloc_buffer(io, remaining); clone = crypt_alloc_buffer(io, remaining);
if (unlikely(!clone)) { if (unlikely(!clone)) {
crypt_dec_pending(io, -ENOMEM); io->error = -ENOMEM;
return; return;
} }
ctx.bio_out = clone; io->ctx.bio_out = clone;
ctx.idx_out = 0; io->ctx.idx_out = 0;
if (unlikely(crypt_convert(cc, &ctx) < 0)) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
crypt_dec_pending(io, -EIO);
return;
}
/* crypt_convert should have filled the clone bio */
BUG_ON(ctx.idx_out < clone->bi_vcnt);
clone->bi_sector = cc->start + sector;
remaining -= clone->bi_size; remaining -= clone->bi_size;
sector += bio_sectors(clone);
/* Grab another reference to the io struct r = crypt_convert(cc, &io->ctx);
* before we kick off the request */
if (remaining) if (r != -EINPROGRESS) {
kcryptd_crypt_write_io_submit(io, r, 0);
if (unlikely(r < 0))
return;
} else
atomic_inc(&io->pending); atomic_inc(&io->pending);
generic_make_request(clone);
/* Do not reference clone after this - it
* may be gone already. */
/* out of memory -> run queues */ /* out of memory -> run queues */
if (remaining) if (unlikely(remaining))
congestion_wait(WRITE, HZ/100); congestion_wait(WRITE, HZ/100);
} }
} }
static void process_read_endio(struct dm_crypt_io *io) static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
{ {
struct crypt_config *cc = io->target->private; struct crypt_config *cc = io->target->private;
struct convert_context ctx;
crypt_convert_init(cc, &ctx, io->base_bio, io->base_bio, /*
io->base_bio->bi_sector - io->target->begin, 0); * Prevent io from disappearing until this function completes.
*/
atomic_inc(&io->pending);
crypt_dec_pending(io, crypt_convert(cc, &ctx)); crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector);
kcryptd_crypt_write_convert_loop(io);
crypt_dec_pending(io);
} }
static void kcryptd_do_work(struct work_struct *work) static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
{ {
struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); if (unlikely(error < 0))
io->error = -EIO;
if (bio_data_dir(io->base_bio) == READ) crypt_dec_pending(io);
process_read(io);
} }
static void kcryptd_do_crypt(struct work_struct *work) static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
{ {
struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); struct crypt_config *cc = io->target->private;
int r = 0;
atomic_inc(&io->pending);
crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio,
io->sector);
r = crypt_convert(cc, &io->ctx);
if (r != -EINPROGRESS)
kcryptd_crypt_read_done(io, r);
crypt_dec_pending(io);
}
static void kcryptd_async_done(struct crypto_async_request *async_req,
int error)
{
struct convert_context *ctx = async_req->data;
struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
struct crypt_config *cc = io->target->private;
if (error == -EINPROGRESS) {
complete(&ctx->restart);
return;
}
mempool_free(ablkcipher_request_cast(async_req), cc->req_pool);
if (!atomic_dec_and_test(&ctx->pending))
return;
if (bio_data_dir(io->base_bio) == READ) if (bio_data_dir(io->base_bio) == READ)
process_read_endio(io); kcryptd_crypt_read_done(io, error);
else else
process_write(io); kcryptd_crypt_write_io_submit(io, error, 1);
}
static void kcryptd_crypt(struct work_struct *work)
{
struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
if (bio_data_dir(io->base_bio) == READ)
kcryptd_crypt_read_convert(io);
else
kcryptd_crypt_write_convert(io);
}
static void kcryptd_queue_crypt(struct dm_crypt_io *io)
{
struct crypt_config *cc = io->target->private;
INIT_WORK(&io->work, kcryptd_crypt);
queue_work(cc->crypt_queue, &io->work);
} }
/* /*
@ -733,7 +866,7 @@ static int crypt_wipe_key(struct crypt_config *cc)
static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{ {
struct crypt_config *cc; struct crypt_config *cc;
struct crypto_blkcipher *tfm; struct crypto_ablkcipher *tfm;
char *tmp; char *tmp;
char *cipher; char *cipher;
char *chainmode; char *chainmode;
@ -787,7 +920,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_cipher; goto bad_cipher;
} }
tfm = crypto_alloc_blkcipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0);
if (IS_ERR(tfm)) { if (IS_ERR(tfm)) {
ti->error = "Error allocating crypto tfm"; ti->error = "Error allocating crypto tfm";
goto bad_cipher; goto bad_cipher;
@ -821,7 +954,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
goto bad_ivmode; goto bad_ivmode;
cc->iv_size = crypto_blkcipher_ivsize(tfm); cc->iv_size = crypto_ablkcipher_ivsize(tfm);
if (cc->iv_size) if (cc->iv_size)
/* at least a 64 bit sector number should fit in our buffer */ /* at least a 64 bit sector number should fit in our buffer */
cc->iv_size = max(cc->iv_size, cc->iv_size = max(cc->iv_size,
@ -841,6 +974,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_slab_pool; goto bad_slab_pool;
} }
cc->dmreq_start = sizeof(struct ablkcipher_request);
cc->dmreq_start += crypto_ablkcipher_reqsize(tfm);
cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
cc->dmreq_start += crypto_ablkcipher_alignmask(tfm) &
~(crypto_tfm_ctx_alignment() - 1);
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
sizeof(struct dm_crypt_request) + cc->iv_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
goto bad_req_pool;
}
cc->req = NULL;
cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
if (!cc->page_pool) { if (!cc->page_pool) {
ti->error = "Cannot allocate page mempool"; ti->error = "Cannot allocate page mempool";
@ -853,7 +1000,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_bs; goto bad_bs;
} }
if (crypto_blkcipher_setkey(tfm, cc->key, key_size) < 0) { if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
ti->error = "Error setting key"; ti->error = "Error setting key";
goto bad_device; goto bad_device;
} }
@ -914,12 +1061,14 @@ bad_device:
bad_bs: bad_bs:
mempool_destroy(cc->page_pool); mempool_destroy(cc->page_pool);
bad_page_pool: bad_page_pool:
mempool_destroy(cc->req_pool);
bad_req_pool:
mempool_destroy(cc->io_pool); mempool_destroy(cc->io_pool);
bad_slab_pool: bad_slab_pool:
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc); cc->iv_gen_ops->dtr(cc);
bad_ivmode: bad_ivmode:
crypto_free_blkcipher(tfm); crypto_free_ablkcipher(tfm);
bad_cipher: bad_cipher:
/* Must zero key material before freeing */ /* Must zero key material before freeing */
memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
@ -934,14 +1083,18 @@ static void crypt_dtr(struct dm_target *ti)
destroy_workqueue(cc->io_queue); destroy_workqueue(cc->io_queue);
destroy_workqueue(cc->crypt_queue); destroy_workqueue(cc->crypt_queue);
if (cc->req)
mempool_free(cc->req, cc->req_pool);
bioset_free(cc->bs); bioset_free(cc->bs);
mempool_destroy(cc->page_pool); mempool_destroy(cc->page_pool);
mempool_destroy(cc->req_pool);
mempool_destroy(cc->io_pool); mempool_destroy(cc->io_pool);
kfree(cc->iv_mode); kfree(cc->iv_mode);
if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
cc->iv_gen_ops->dtr(cc); cc->iv_gen_ops->dtr(cc);
crypto_free_blkcipher(cc->tfm); crypto_free_ablkcipher(cc->tfm);
dm_put_device(ti, cc->dev); dm_put_device(ti, cc->dev);
/* Must zero key material before freeing */ /* Must zero key material before freeing */
@ -958,6 +1111,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
io = mempool_alloc(cc->io_pool, GFP_NOIO); io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->target = ti; io->target = ti;
io->base_bio = bio; io->base_bio = bio;
io->sector = bio->bi_sector - ti->begin;
io->error = 0; io->error = 0;
atomic_set(&io->pending, 0); atomic_set(&io->pending, 0);

View File

@ -449,7 +449,7 @@ static void persistent_destroy(struct exception_store *store)
static int persistent_read_metadata(struct exception_store *store) static int persistent_read_metadata(struct exception_store *store)
{ {
int r, new_snapshot; int r, uninitialized_var(new_snapshot);
struct pstore *ps = get_info(store); struct pstore *ps = get_info(store);
/* /*

View File

@ -15,6 +15,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/dm-ioctl.h> #include <linux/dm-ioctl.h>
#include <linux/hdreg.h> #include <linux/hdreg.h>
#include <linux/compat.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
@ -702,7 +703,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
int r; int r;
char *new_name = (char *) param + param->data_start; char *new_name = (char *) param + param->data_start;
if (new_name < (char *) param->data || if (new_name < param->data ||
invalid_str(new_name, (void *) param + param_size)) { invalid_str(new_name, (void *) param + param_size)) {
DMWARN("Invalid new logical volume name supplied."); DMWARN("Invalid new logical volume name supplied.");
return -EINVAL; return -EINVAL;
@ -728,7 +729,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
if (!md) if (!md)
return -ENXIO; return -ENXIO;
if (geostr < (char *) param->data || if (geostr < param->data ||
invalid_str(geostr, (void *) param + param_size)) { invalid_str(geostr, (void *) param + param_size)) {
DMWARN("Invalid geometry supplied."); DMWARN("Invalid geometry supplied.");
goto out; goto out;
@ -1350,10 +1351,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
{ {
struct dm_ioctl tmp, *dmi; struct dm_ioctl tmp, *dmi;
if (copy_from_user(&tmp, user, sizeof(tmp))) if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data)))
return -EFAULT; return -EFAULT;
if (tmp.data_size < sizeof(tmp)) if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data)))
return -EINVAL; return -EINVAL;
dmi = vmalloc(tmp.data_size); dmi = vmalloc(tmp.data_size);
@ -1397,13 +1398,11 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
return 0; return 0;
} }
static int ctl_ioctl(struct inode *inode, struct file *file, static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
uint command, ulong u)
{ {
int r = 0; int r = 0;
unsigned int cmd; unsigned int cmd;
struct dm_ioctl *param; struct dm_ioctl *uninitialized_var(param);
struct dm_ioctl __user *user = (struct dm_ioctl __user *) u;
ioctl_fn fn = NULL; ioctl_fn fn = NULL;
size_t param_size; size_t param_size;
@ -1471,8 +1470,23 @@ static int ctl_ioctl(struct inode *inode, struct file *file,
return r; return r;
} }
static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
{
return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
}
#ifdef CONFIG_COMPAT
static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
{
return (long)dm_ctl_ioctl(file, command, (ulong) compat_ptr(u));
}
#else
#define dm_compat_ctl_ioctl NULL
#endif
static const struct file_operations _ctl_fops = { static const struct file_operations _ctl_fops = {
.ioctl = ctl_ioctl, .unlocked_ioctl = dm_ctl_ioctl,
.compat_ioctl = dm_compat_ctl_ioctl,
.owner = THIS_MODULE, .owner = THIS_MODULE,
}; };

View File

@ -41,7 +41,7 @@ int dm_unregister_dirty_log_type(struct dirty_log_type *type)
return 0; return 0;
} }
static struct dirty_log_type *get_type(const char *type_name) static struct dirty_log_type *_get_type(const char *type_name)
{ {
struct dirty_log_type *type; struct dirty_log_type *type;
@ -61,6 +61,55 @@ static struct dirty_log_type *get_type(const char *type_name)
return NULL; return NULL;
} }
/*
 * get_type - look up a dirty log type, loading a module if needed
 * @type_name: name of the requested log type
 *
 * A type that _get_type() already finds is returned immediately.
 * Otherwise request_module() is asked for "dm-log-<name>", starting
 * with the full type name.  After every attempt that does not make
 * the type available, the name used for the module request is cut at
 * its last '-' and the request is repeated, because a single module
 * may contain several types.  The lookup itself always uses the
 * original, untruncated @type_name.
 *
 * For example, "clustered-disk" requests 'dm-log-clustered-disk'
 * and then 'dm-log-clustered'.
 *
 * Returns: dirty_log_type* on success, NULL on failure (including
 * failure to allocate the scratch copy of the name).
 */
static struct dirty_log_type *get_type(const char *type_name)
{
	struct dirty_log_type *found = _get_type(type_name);
	char *module_suffix, *dash;

	if (found)
		return found;

	/* Work on a private copy: the name is shortened in place below. */
	module_suffix = kstrdup(type_name, GFP_KERNEL);
	if (!module_suffix) {
		DMWARN("No memory left to attempt log module load for \"%s\"",
		       type_name);
		return NULL;
	}

	for (;;) {
		/* Only look the type up again if the module request worked. */
		if (!request_module("dm-log-%s", module_suffix)) {
			found = _get_type(type_name);
			if (found)
				break;
		}

		/* Drop the last '-' component; give up when none is left. */
		dash = strrchr(module_suffix, '-');
		if (!dash)
			break;
		*dash = '\0';
	}

	if (!found)
		DMWARN("Module for logging type \"%s\" not found.", type_name);

	kfree(module_suffix);

	return found;
}
static void put_type(struct dirty_log_type *type) static void put_type(struct dirty_log_type *type)
{ {
spin_lock(&_lock); spin_lock(&_lock);

View File

@ -106,7 +106,7 @@ typedef int (*action_fn) (struct pgpath *pgpath);
static struct kmem_cache *_mpio_cache; static struct kmem_cache *_mpio_cache;
struct workqueue_struct *kmultipathd; static struct workqueue_struct *kmultipathd;
static void process_queued_ios(struct work_struct *work); static void process_queued_ios(struct work_struct *work);
static void trigger_event(struct work_struct *work); static void trigger_event(struct work_struct *work);

View File

@ -6,6 +6,7 @@
#include "dm.h" #include "dm.h"
#include "dm-bio-list.h" #include "dm-bio-list.h"
#include "dm-bio-record.h"
#include "dm-io.h" #include "dm-io.h"
#include "dm-log.h" #include "dm-log.h"
#include "kcopyd.h" #include "kcopyd.h"
@ -20,6 +21,7 @@
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/log2.h> #include <linux/log2.h>
#include <linux/hardirq.h>
#define DM_MSG_PREFIX "raid1" #define DM_MSG_PREFIX "raid1"
#define DM_IO_PAGES 64 #define DM_IO_PAGES 64
@ -113,9 +115,16 @@ struct region {
/*----------------------------------------------------------------- /*-----------------------------------------------------------------
* Mirror set structures. * Mirror set structures.
*---------------------------------------------------------------*/ *---------------------------------------------------------------*/
/*
* Kinds of failure that can be recorded against a mirror leg.
* fail_mirror() uses the value as a bit number in struct mirror's
* error_type field, and device_status_char() tests the same bits to
* choose the status character it reports.
*/
enum dm_raid1_error {
DM_RAID1_WRITE_ERROR, /* reported by write_callback() */
DM_RAID1_SYNC_ERROR, /* reported by recovery_complete() */
DM_RAID1_READ_ERROR /* reported by read_callback() and mirror_end_io() */
};
struct mirror { struct mirror {
struct mirror_set *ms; struct mirror_set *ms;
atomic_t error_count; atomic_t error_count;
uint32_t error_type;
struct dm_dev *dev; struct dm_dev *dev;
sector_t offset; sector_t offset;
}; };
@ -127,21 +136,25 @@ struct mirror_set {
struct kcopyd_client *kcopyd_client; struct kcopyd_client *kcopyd_client;
uint64_t features; uint64_t features;
spinlock_t lock; /* protects the next two lists */ spinlock_t lock; /* protects the lists */
struct bio_list reads; struct bio_list reads;
struct bio_list writes; struct bio_list writes;
struct bio_list failures;
struct dm_io_client *io_client; struct dm_io_client *io_client;
mempool_t *read_record_pool;
/* recovery */ /* recovery */
region_t nr_regions; region_t nr_regions;
int in_sync; int in_sync;
int log_failure; int log_failure;
atomic_t suspend;
struct mirror *default_mirror; /* Default mirror */ atomic_t default_mirror; /* Default mirror */
struct workqueue_struct *kmirrord_wq; struct workqueue_struct *kmirrord_wq;
struct work_struct kmirrord_work; struct work_struct kmirrord_work;
struct work_struct trigger_event;
unsigned int nr_mirrors; unsigned int nr_mirrors;
struct mirror mirror[0]; struct mirror mirror[0];
@ -362,6 +375,16 @@ static void complete_resync_work(struct region *reg, int success)
struct region_hash *rh = reg->rh; struct region_hash *rh = reg->rh;
rh->log->type->set_region_sync(rh->log, reg->key, success); rh->log->type->set_region_sync(rh->log, reg->key, success);
/*
* Dispatch the bios before we call 'wake_up_all'.
* This is important because if we are suspending,
* we want to know that recovery is complete and
* the work queue is flushed. If we wake_up_all
* before we dispatch_bios (queue bios and call wake()),
* then we risk suspending before the work queue
* has been properly flushed.
*/
dispatch_bios(rh->ms, &reg->delayed_bios); dispatch_bios(rh->ms, &reg->delayed_bios);
if (atomic_dec_and_test(&rh->recovery_in_flight)) if (atomic_dec_and_test(&rh->recovery_in_flight))
wake_up_all(&_kmirrord_recovery_stopped); wake_up_all(&_kmirrord_recovery_stopped);
@ -626,24 +649,101 @@ static void rh_start_recovery(struct region_hash *rh)
wake(rh->ms); wake(rh->ms);
} }
/*
* Element count passed to mempool_create_kmalloc_pool() when
* alloc_context() creates a mirror set's read_record_pool.
*/
#define MIN_READ_RECORDS 20
/*
* Bookkeeping for a read that mirror_map() remaps directly to a mirror.
* It is stored in map_context->ptr so that mirror_end_io() can restore
* the bio and requeue it if the read fails.
*/
struct dm_raid1_read_record {
struct mirror *m; /* mirror the read was mapped to */
struct dm_bio_details details; /* bio state saved by dm_bio_record() */
};
/* /*
* Every mirror should look like this one. * Every mirror should look like this one.
*/ */
#define DEFAULT_MIRROR 0 #define DEFAULT_MIRROR 0
/* /*
* This is yucky. We squirrel the mirror_set struct away inside * This is yucky. We squirrel the mirror struct away inside
* bi_next for write buffers. This is safe since the bh * bi_next for read/write buffers. This is safe since the bh
* doesn't get submitted to the lower levels of block layer. * doesn't get submitted to the lower levels of block layer.
*/ */
static struct mirror_set *bio_get_ms(struct bio *bio) static struct mirror *bio_get_m(struct bio *bio)
{ {
return (struct mirror_set *) bio->bi_next; return (struct mirror *) bio->bi_next;
} }
static void bio_set_ms(struct bio *bio, struct mirror_set *ms) static void bio_set_m(struct bio *bio, struct mirror *m)
{ {
bio->bi_next = (struct bio *) ms; bio->bi_next = (struct bio *) m;
}
/*
 * Return the mirror leg currently selected as the default for @ms.
 * ms->default_mirror holds the leg's index into ms->mirror[].
 */
static struct mirror *get_default_mirror(struct mirror_set *ms)
{
	int idx = atomic_read(&ms->default_mirror);

	return ms->mirror + idx;
}
/*
 * Make @m the default leg of the mirror set it belongs to by storing
 * its index within the set's mirror[] array in ms->default_mirror.
 */
static void set_default_mirror(struct mirror *m)
{
	struct mirror_set *set = m->ms;

	atomic_set(&set->default_mirror, m - set->mirror);
}
/* fail_mirror
* @m: mirror device to fail
* @error_type: one of the enum dm_raid1_error values (DM_RAID1_*_ERROR)
*
* Does nothing unless errors_handled() is true for the mirror set.
* Otherwise the error count of @m is incremented and @error_type is
* recorded as a bit in m->error_type.  If that bit was already set we
* return without raising another event; otherwise userspace is
* signalled by scheduling the set's trigger_event work.
*
* Additionally, if @m is the default (primary) mirror and the set is
* in-sync, the first mirror with a zero error count becomes the new
* default.  If the set is not in-sync the default is left unchanged.
*
* This function must not block.
*/
static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
{
struct mirror_set *ms = m->ms;
struct mirror *new;
if (!errors_handled(ms))
return;
/*
* error_count is used for nothing more than a
* simple way to tell if a device has encountered
* errors.
*/
atomic_inc(&m->error_count);
/*
* NOTE(review): error_type is declared uint32_t in struct mirror;
* the kernel bitops presumably expect an unsigned long * here.
* Confirm this is safe on 64-bit (especially big-endian) targets.
*/
if (test_and_set_bit(error_type, &m->error_type))
return;
/* Only a failure of the default mirror requires choosing a new one. */
if (m != get_default_mirror(ms))
goto out;
if (!ms->in_sync) {
/*
* Better to issue requests to same failing device
* than to risk returning corrupt data.
*/
DMERR("Primary mirror (%s) failed while out-of-sync: "
"Reads may fail.", m->dev->name);
goto out;
}
/* Promote the first leg that has not recorded any error. */
for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
if (!atomic_read(&new->error_count)) {
set_default_mirror(new);
break;
}
/* The loop ran off the end: no error-free leg was found. */
if (unlikely(new == ms->mirror + ms->nr_mirrors))
DMWARN("All sides of mirror have failed.");
out:
/* The event itself is raised later from the trigger_event work item. */
schedule_work(&ms->trigger_event);
} }
/*----------------------------------------------------------------- /*-----------------------------------------------------------------
@ -656,15 +756,32 @@ static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
static void recovery_complete(int read_err, unsigned int write_err, static void recovery_complete(int read_err, unsigned int write_err,
void *context) void *context)
{ {
struct region *reg = (struct region *) context; struct region *reg = (struct region *)context;
struct mirror_set *ms = reg->rh->ms;
int m, bit = 0;
if (read_err) if (read_err) {
/* Read error means the failure of default mirror. */ /* Read error means the failure of default mirror. */
DMERR_LIMIT("Unable to read primary mirror during recovery"); DMERR_LIMIT("Unable to read primary mirror during recovery");
fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR);
}
if (write_err) if (write_err) {
DMERR_LIMIT("Write error during recovery (error = 0x%x)", DMERR_LIMIT("Write error during recovery (error = 0x%x)",
write_err); write_err);
/*
* Bits correspond to devices (excluding default mirror).
* The default mirror cannot change during recovery.
*/
for (m = 0; m < ms->nr_mirrors; m++) {
if (&ms->mirror[m] == get_default_mirror(ms))
continue;
if (test_bit(bit, &write_err))
fail_mirror(ms->mirror + m,
DM_RAID1_SYNC_ERROR);
bit++;
}
}
rh_recovery_end(reg, !(read_err || write_err)); rh_recovery_end(reg, !(read_err || write_err));
} }
@ -678,7 +795,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
unsigned long flags = 0; unsigned long flags = 0;
/* fill in the source */ /* fill in the source */
m = ms->default_mirror; m = get_default_mirror(ms);
from.bdev = m->dev->bdev; from.bdev = m->dev->bdev;
from.sector = m->offset + region_to_sector(reg->rh, reg->key); from.sector = m->offset + region_to_sector(reg->rh, reg->key);
if (reg->key == (ms->nr_regions - 1)) { if (reg->key == (ms->nr_regions - 1)) {
@ -694,7 +811,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
/* fill in the destinations */ /* fill in the destinations */
for (i = 0, dest = to; i < ms->nr_mirrors; i++) { for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
if (&ms->mirror[i] == ms->default_mirror) if (&ms->mirror[i] == get_default_mirror(ms))
continue; continue;
m = ms->mirror + i; m = ms->mirror + i;
@ -748,17 +865,105 @@ static void do_recovery(struct mirror_set *ms)
*---------------------------------------------------------------*/ *---------------------------------------------------------------*/
static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
{ {
/* FIXME: add read balancing */ struct mirror *m = get_default_mirror(ms);
return ms->default_mirror;
do {
if (likely(!atomic_read(&m->error_count)))
return m;
if (m-- == ms->mirror)
m += ms->nr_mirrors;
} while (m != get_default_mirror(ms));
return NULL;
}
/*
 * Return non-zero when the default mirror of the set that @m belongs
 * to has an error count of zero.
 */
static int default_ok(struct mirror *m)
{
	return !atomic_read(&get_default_mirror(m->ms)->error_count);
}
/*
* Report whether some mirror can service @bio.
*
* Returns 1 only when the dirty log's in_sync() method reports the bio's
* region as in-sync and choose_mirror() finds a mirror; otherwise 0.
* The third in_sync() argument is 0 -- presumably "do not block";
* confirm against the dirty log interface.
*/
static int mirror_available(struct mirror_set *ms, struct bio *bio)
{
region_t region = bio_to_region(&ms->rh, bio);
if (ms->rh.log->type->in_sync(ms->rh.log, region, 0))
return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
return 0;
} }
/* /*
* remap a buffer to a particular mirror. * remap a buffer to a particular mirror.
*/ */
static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) static sector_t map_sector(struct mirror *m, struct bio *bio)
{
return m->offset + (bio->bi_sector - m->ms->ti->begin);
}
static void map_bio(struct mirror *m, struct bio *bio)
{ {
bio->bi_bdev = m->dev->bdev; bio->bi_bdev = m->dev->bdev;
bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); bio->bi_sector = map_sector(m, bio);
}
static void map_region(struct io_region *io, struct mirror *m,
struct bio *bio)
{
io->bdev = m->dev->bdev;
io->sector = map_sector(m, bio);
io->count = bio->bi_size >> 9;
}
/*-----------------------------------------------------------------
* Reads
*---------------------------------------------------------------*/
static void read_callback(unsigned long error, void *context)
{
struct bio *bio = context;
struct mirror *m;
m = bio_get_m(bio);
bio_set_m(bio, NULL);
if (likely(!error)) {
bio_endio(bio, 0);
return;
}
fail_mirror(m, DM_RAID1_READ_ERROR);
if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
DMWARN_LIMIT("Read failure on mirror device %s. "
"Trying alternative device.",
m->dev->name);
queue_bio(m->ms, bio, bio_rw(bio));
return;
}
DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
m->dev->name);
bio_endio(bio, -EIO);
}
/*
* Asynchronous read.
*
* Issue @bio as a READ to mirror @m through dm_io().  The request uses
* the bio's own bvec array starting at bi_idx, and names read_callback()
* as the notify function with the bio as its context.  The mirror is
* stashed in the bio with bio_set_m() so read_callback() can tell which
* leg was read.
*/
static void read_async_bio(struct mirror *m, struct bio *bio)
{
struct io_region io;
struct dm_io_request io_req = {
.bi_rw = READ,
.mem.type = DM_IO_BVEC,
.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
.notify.fn = read_callback,
.notify.context = bio,
.client = m->ms->io_client,
};
map_region(&io, m, bio);
bio_set_m(bio, m);
/* The return value is deliberately discarded; presumably failures are reported through read_callback() -- confirm. */
(void) dm_io(&io_req, 1, &io, NULL);
} }
static void do_reads(struct mirror_set *ms, struct bio_list *reads) static void do_reads(struct mirror_set *ms, struct bio_list *reads)
@ -769,17 +974,20 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
while ((bio = bio_list_pop(reads))) { while ((bio = bio_list_pop(reads))) {
region = bio_to_region(&ms->rh, bio); region = bio_to_region(&ms->rh, bio);
m = get_default_mirror(ms);
/* /*
* We can only read balance if the region is in sync. * We can only read balance if the region is in sync.
*/ */
if (rh_in_sync(&ms->rh, region, 1)) if (likely(rh_in_sync(&ms->rh, region, 1)))
m = choose_mirror(ms, bio->bi_sector); m = choose_mirror(ms, bio->bi_sector);
else else if (m && atomic_read(&m->error_count))
m = ms->default_mirror; m = NULL;
map_bio(ms, m, bio); if (likely(m))
generic_make_request(bio); read_async_bio(m, bio);
else
bio_endio(bio, -EIO);
} }
} }
@ -793,15 +1001,70 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
* RECOVERING: delay the io until recovery completes * RECOVERING: delay the io until recovery completes
* NOSYNC: increment pending, just write to the default mirror * NOSYNC: increment pending, just write to the default mirror
*---------------------------------------------------------------*/ *---------------------------------------------------------------*/
/* __bio_mark_nosync
* @ms: mirror set the bio was written to
* @bio: the write that failed on at least one mirror
* @done: not referenced in the function body
* @error: status passed to bio_endio() (do_failures() passes 0)
*
* The bio was written on some mirror(s) but failed on other mirror(s).
* We can successfully endio the bio but should avoid the region being
* marked clean by setting the state RH_NOSYNC.
*
* If the region was in state RH_RECOVERING, complete_resync_work() is
* called with success == 0 after the bio has been ended.
*
* This function is _not_ safe in interrupt context!
*/
static void __bio_mark_nosync(struct mirror_set *ms,
struct bio *bio, unsigned done, int error)
{
unsigned long flags;
struct region_hash *rh = &ms->rh;
struct dirty_log *log = ms->rh.log;
struct region *reg;
region_t region = bio_to_region(rh, bio);
int recovering = 0;
/* We must inform the log that the sync count has changed. */
log->type->set_region_sync(log, region, 0);
ms->in_sync = 0;
read_lock(&rh->hash_lock);
reg = __rh_find(rh, region);
read_unlock(&rh->hash_lock);
/* region hash entry should exist because write was in-flight */
BUG_ON(!reg);
BUG_ON(!list_empty(&reg->list));
spin_lock_irqsave(&rh->region_lock, flags);
/*
* Possible cases:
* 1) RH_DIRTY
* 2) RH_NOSYNC: was dirty, other preceding writes failed
* 3) RH_RECOVERING: flushing pending writes
* In every case, the region should not have been connected to a list.
*/
recovering = (reg->state == RH_RECOVERING);
reg->state = RH_NOSYNC;
BUG_ON(!list_empty(&reg->list));
spin_unlock_irqrestore(&rh->region_lock, flags);
/* End the bio only after region_lock has been released. */
bio_endio(bio, error);
if (recovering)
complete_resync_work(reg, 0);
}
static void write_callback(unsigned long error, void *context) static void write_callback(unsigned long error, void *context)
{ {
unsigned int i; unsigned i, ret = 0;
int uptodate = 1;
struct bio *bio = (struct bio *) context; struct bio *bio = (struct bio *) context;
struct mirror_set *ms; struct mirror_set *ms;
int uptodate = 0;
int should_wake = 0;
unsigned long flags;
ms = bio_get_ms(bio); ms = bio_get_m(bio)->ms;
bio_set_ms(bio, NULL); bio_set_m(bio, NULL);
/* /*
* NOTE: We don't decrement the pending count here, * NOTE: We don't decrement the pending count here,
@ -809,26 +1072,42 @@ static void write_callback(unsigned long error, void *context)
* This way we handle both writes to SYNC and NOSYNC * This way we handle both writes to SYNC and NOSYNC
* regions with the same code. * regions with the same code.
*/ */
if (likely(!error))
goto out;
if (error) { for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error))
fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
else
uptodate = 1;
if (unlikely(!uptodate)) {
DMERR("All replicated volumes dead, failing I/O");
/* None of the writes succeeded, fail the I/O. */
ret = -EIO;
} else if (errors_handled(ms)) {
/* /*
* only error the io if all mirrors failed. * Need to raise event. Since raising
* FIXME: bogus * events can block, we need to do it in
* the main thread.
*/ */
uptodate = 0; spin_lock_irqsave(&ms->lock, flags);
for (i = 0; i < ms->nr_mirrors; i++) if (!ms->failures.head)
if (!test_bit(i, &error)) { should_wake = 1;
uptodate = 1; bio_list_add(&ms->failures, bio);
break; spin_unlock_irqrestore(&ms->lock, flags);
} if (should_wake)
wake(ms);
return;
} }
bio_endio(bio, 0); out:
bio_endio(bio, ret);
} }
static void do_write(struct mirror_set *ms, struct bio *bio) static void do_write(struct mirror_set *ms, struct bio *bio)
{ {
unsigned int i; unsigned int i;
struct io_region io[KCOPYD_MAX_REGIONS+1]; struct io_region io[ms->nr_mirrors], *dest = io;
struct mirror *m; struct mirror *m;
struct dm_io_request io_req = { struct dm_io_request io_req = {
.bi_rw = WRITE, .bi_rw = WRITE,
@ -839,15 +1118,14 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
.client = ms->io_client, .client = ms->io_client,
}; };
for (i = 0; i < ms->nr_mirrors; i++) { for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
m = ms->mirror + i; map_region(dest++, m, bio);
io[i].bdev = m->dev->bdev; /*
io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); * Use default mirror because we only need it to retrieve the reference
io[i].count = bio->bi_size >> 9; * to the mirror set in write_callback().
} */
bio_set_m(bio, get_default_mirror(ms));
bio_set_ms(bio, ms);
(void) dm_io(&io_req, ms->nr_mirrors, io, NULL); (void) dm_io(&io_req, ms->nr_mirrors, io, NULL);
} }
@ -900,43 +1178,125 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
/* /*
* Dispatch io. * Dispatch io.
*/ */
if (unlikely(ms->log_failure)) if (unlikely(ms->log_failure)) {
spin_lock_irq(&ms->lock);
bio_list_merge(&ms->failures, &sync);
spin_unlock_irq(&ms->lock);
} else
while ((bio = bio_list_pop(&sync))) while ((bio = bio_list_pop(&sync)))
bio_endio(bio, -EIO); do_write(ms, bio);
else while ((bio = bio_list_pop(&sync)))
do_write(ms, bio);
while ((bio = bio_list_pop(&recover))) while ((bio = bio_list_pop(&recover)))
rh_delay(&ms->rh, bio); rh_delay(&ms->rh, bio);
while ((bio = bio_list_pop(&nosync))) { while ((bio = bio_list_pop(&nosync))) {
map_bio(ms, ms->default_mirror, bio); map_bio(get_default_mirror(ms), bio);
generic_make_request(bio); generic_make_request(bio);
} }
} }
/*
 * do_failures - process the writes parked on the failures list
 * @ms: mirror set the bios belong to
 * @failures: private list of bios the caller took off ms->failures
 *
 * An empty list is a no-op.  If the log has not failed, every bio is
 * handed to __bio_mark_nosync(), which ends it and marks its region
 * RH_NOSYNC.
 *
 * If the log has failed, unattempted writes are also being put on
 * the failures list.  We can't issue those writes until a log has
 * been marked, so we must store them:
 *
 *  - If a 'noflush' suspend is in progress, we can requeue the I/Os
 *    to the core.  This gives userspace a chance to reconfigure the
 *    mirror, at which point the core will reissue the writes.
 *  - If the set is suspending without 'noflush', we have no choice
 *    but to return errors.
 *  - Otherwise the bios are merged back onto ms->failures and the
 *    worker is woken again.
 *
 * Some writes on the failures list may have been submitted before
 * the log failure and represent a failure to write to one of the
 * devices.  It is ok for us to treat them the same and requeue them
 * as well.
 */
static void do_failures(struct mirror_set *ms, struct bio_list *failures)
{
	struct bio *bio;

	if (!failures->head)
		return;

	if (!ms->log_failure) {
		for (bio = bio_list_pop(failures); bio;
		     bio = bio_list_pop(failures))
			__bio_mark_nosync(ms, bio, bio->bi_size, 0);
	} else if (dm_noflush_suspending(ms->ti)) {
		for (bio = bio_list_pop(failures); bio;
		     bio = bio_list_pop(failures))
			bio_endio(bio, DM_ENDIO_REQUEUE);
	} else if (atomic_read(&ms->suspend)) {
		for (bio = bio_list_pop(failures); bio;
		     bio = bio_list_pop(failures))
			bio_endio(bio, -EIO);
	} else {
		/* Keep holding the bios; try again on the next pass. */
		spin_lock_irq(&ms->lock);
		bio_list_merge(&ms->failures, failures);
		spin_unlock_irq(&ms->lock);

		wake(ms);
	}
}
/*
 * Work function for a mirror set's trigger_event work item (scheduled
 * by fail_mirror()): raises a table event for the target's table.
 */
static void trigger_event(struct work_struct *work)
{
	struct mirror_set *ms;

	ms = container_of(work, struct mirror_set, trigger_event);

	dm_table_event(ms->ti->table);
}
/*----------------------------------------------------------------- /*-----------------------------------------------------------------
* kmirrord * kmirrord
*---------------------------------------------------------------*/ *---------------------------------------------------------------*/
static void do_mirror(struct work_struct *work) static int _do_mirror(struct work_struct *work)
{ {
struct mirror_set *ms =container_of(work, struct mirror_set, struct mirror_set *ms =container_of(work, struct mirror_set,
kmirrord_work); kmirrord_work);
struct bio_list reads, writes; struct bio_list reads, writes, failures;
unsigned long flags;
spin_lock(&ms->lock); spin_lock_irqsave(&ms->lock, flags);
reads = ms->reads; reads = ms->reads;
writes = ms->writes; writes = ms->writes;
failures = ms->failures;
bio_list_init(&ms->reads); bio_list_init(&ms->reads);
bio_list_init(&ms->writes); bio_list_init(&ms->writes);
spin_unlock(&ms->lock); bio_list_init(&ms->failures);
spin_unlock_irqrestore(&ms->lock, flags);
rh_update_states(&ms->rh); rh_update_states(&ms->rh);
do_recovery(ms); do_recovery(ms);
do_reads(ms, &reads); do_reads(ms, &reads);
do_writes(ms, &writes); do_writes(ms, &writes);
do_failures(ms, &failures);
return (ms->failures.head) ? 1 : 0;
} }
/*
 * kmirrord work function: run _do_mirror() until it reports that no
 * failed bios remain queued.
 *
 * _do_mirror() returns 1 while ms->failures is non-empty.  Retrying
 * here, instead of relying on another wake(), helps for cases like
 * 'suspend', where flush_workqueue() is called and all work is
 * expected to be finished: a 'wake' issued for a failure that happens
 * during a suspend would not be honored.
 */
static void do_mirror(struct work_struct *work)
{
	for (;;) {
		if (!_do_mirror(work))
			break;

		schedule();
	}
}
/*----------------------------------------------------------------- /*-----------------------------------------------------------------
* Target functions * Target functions
*---------------------------------------------------------------*/ *---------------------------------------------------------------*/
@ -965,11 +1325,23 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
ms->nr_mirrors = nr_mirrors; ms->nr_mirrors = nr_mirrors;
ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->nr_regions = dm_sector_div_up(ti->len, region_size);
ms->in_sync = 0; ms->in_sync = 0;
ms->default_mirror = &ms->mirror[DEFAULT_MIRROR]; ms->log_failure = 0;
atomic_set(&ms->suspend, 0);
atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
len = sizeof(struct dm_raid1_read_record);
ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS,
len);
if (!ms->read_record_pool) {
ti->error = "Error creating mirror read_record_pool";
kfree(ms);
return NULL;
}
ms->io_client = dm_io_client_create(DM_IO_PAGES); ms->io_client = dm_io_client_create(DM_IO_PAGES);
if (IS_ERR(ms->io_client)) { if (IS_ERR(ms->io_client)) {
ti->error = "Error creating dm_io client"; ti->error = "Error creating dm_io client";
mempool_destroy(ms->read_record_pool);
kfree(ms); kfree(ms);
return NULL; return NULL;
} }
@ -977,6 +1349,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
ti->error = "Error creating dirty region hash"; ti->error = "Error creating dirty region hash";
dm_io_client_destroy(ms->io_client); dm_io_client_destroy(ms->io_client);
mempool_destroy(ms->read_record_pool);
kfree(ms); kfree(ms);
return NULL; return NULL;
} }
@ -992,6 +1365,7 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti,
dm_io_client_destroy(ms->io_client); dm_io_client_destroy(ms->io_client);
rh_exit(&ms->rh); rh_exit(&ms->rh);
mempool_destroy(ms->read_record_pool);
kfree(ms); kfree(ms);
} }
@ -1019,6 +1393,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
} }
ms->mirror[mirror].ms = ms; ms->mirror[mirror].ms = ms;
atomic_set(&(ms->mirror[mirror].error_count), 0);
ms->mirror[mirror].error_type = 0;
ms->mirror[mirror].offset = offset; ms->mirror[mirror].offset = offset;
return 0; return 0;
@ -1171,6 +1547,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto err_free_context; goto err_free_context;
} }
INIT_WORK(&ms->kmirrord_work, do_mirror); INIT_WORK(&ms->kmirrord_work, do_mirror);
INIT_WORK(&ms->trigger_event, trigger_event);
r = parse_features(ms, argc, argv, &args_used); r = parse_features(ms, argc, argv, &args_used);
if (r) if (r)
@ -1220,14 +1597,15 @@ static void mirror_dtr(struct dm_target *ti)
static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
{ {
unsigned long flags;
int should_wake = 0; int should_wake = 0;
struct bio_list *bl; struct bio_list *bl;
bl = (rw == WRITE) ? &ms->writes : &ms->reads; bl = (rw == WRITE) ? &ms->writes : &ms->reads;
spin_lock(&ms->lock); spin_lock_irqsave(&ms->lock, flags);
should_wake = !(bl->head); should_wake = !(bl->head);
bio_list_add(bl, bio); bio_list_add(bl, bio);
spin_unlock(&ms->lock); spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake) if (should_wake)
wake(ms); wake(ms);
@ -1242,10 +1620,11 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
int r, rw = bio_rw(bio); int r, rw = bio_rw(bio);
struct mirror *m; struct mirror *m;
struct mirror_set *ms = ti->private; struct mirror_set *ms = ti->private;
struct dm_raid1_read_record *read_record = NULL;
map_context->ll = bio_to_region(&ms->rh, bio);
if (rw == WRITE) { if (rw == WRITE) {
/* Save region for mirror_end_io() handler */
map_context->ll = bio_to_region(&ms->rh, bio);
queue_bio(ms, bio, rw); queue_bio(ms, bio, rw);
return DM_MAPIO_SUBMITTED; return DM_MAPIO_SUBMITTED;
} }
@ -1255,28 +1634,34 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
if (r < 0 && r != -EWOULDBLOCK) if (r < 0 && r != -EWOULDBLOCK)
return r; return r;
if (r == -EWOULDBLOCK) /* FIXME: ugly */
r = DM_MAPIO_SUBMITTED;
/* /*
* We don't want to fast track a recovery just for a read * If region is not in-sync queue the bio.
* ahead. So we just let it silently fail.
* FIXME: get rid of this.
*/ */
if (!r && rw == READA) if (!r || (r == -EWOULDBLOCK)) {
return -EIO; if (rw == READA)
return -EWOULDBLOCK;
if (!r) {
/* Pass this io over to the daemon */
queue_bio(ms, bio, rw); queue_bio(ms, bio, rw);
return DM_MAPIO_SUBMITTED; return DM_MAPIO_SUBMITTED;
} }
/*
* The region is in-sync and we can perform reads directly.
* Store enough information so we can retry if it fails.
*/
m = choose_mirror(ms, bio->bi_sector); m = choose_mirror(ms, bio->bi_sector);
if (!m) if (unlikely(!m))
return -EIO; return -EIO;
map_bio(ms, m, bio); read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
if (likely(read_record)) {
dm_bio_record(&read_record->details, bio);
map_context->ptr = read_record;
read_record->m = m;
}
map_bio(m, bio);
return DM_MAPIO_REMAPPED; return DM_MAPIO_REMAPPED;
} }
@ -1285,71 +1670,173 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
{ {
int rw = bio_rw(bio); int rw = bio_rw(bio);
struct mirror_set *ms = (struct mirror_set *) ti->private; struct mirror_set *ms = (struct mirror_set *) ti->private;
region_t region = map_context->ll; struct mirror *m = NULL;
struct dm_bio_details *bd = NULL;
struct dm_raid1_read_record *read_record = map_context->ptr;
/* /*
* We need to dec pending if this was a write. * We need to dec pending if this was a write.
*/ */
if (rw == WRITE) if (rw == WRITE) {
rh_dec(&ms->rh, region); rh_dec(&ms->rh, map_context->ll);
return error;
}
return 0; if (error == -EOPNOTSUPP)
goto out;
if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
goto out;
if (unlikely(error)) {
if (!read_record) {
/*
* There wasn't enough memory to record necessary
* information for a retry or there was no other
* mirror in-sync.
*/
DMERR_LIMIT("Mirror read failed from %s.",
m->dev->name);
return -EIO;
}
DMERR("Mirror read failed from %s. Trying alternative device.",
m->dev->name);
m = read_record->m;
fail_mirror(m, DM_RAID1_READ_ERROR);
/*
* A failed read is requeued for another attempt using an intact
* mirror.
*/
if (default_ok(m) || mirror_available(ms, bio)) {
bd = &read_record->details;
dm_bio_restore(bd, bio);
mempool_free(read_record, ms->read_record_pool);
map_context->ptr = NULL;
queue_bio(ms, bio, rw);
return 1;
}
DMERR("All replicated volumes dead, failing I/O");
}
out:
if (read_record) {
mempool_free(read_record, ms->read_record_pool);
map_context->ptr = NULL;
}
return error;
}
/*
* presuspend hook of the mirror target.
*
* Sets ms->suspend (tested by do_failures()), stops recovery and waits
* until no recovery is in flight, calls the dirty log's presuspend
* method if it has one, and finally flushes the kmirrord workqueue.
*/
static void mirror_presuspend(struct dm_target *ti)
{
struct mirror_set *ms = (struct mirror_set *) ti->private;
struct dirty_log *log = ms->rh.log;
atomic_set(&ms->suspend, 1);
/*
* We must finish up all the work that we've
* generated (i.e. recovery work).
*/
rh_stop_recovery(&ms->rh);
wait_event(_kmirrord_recovery_stopped,
!atomic_read(&ms->rh.recovery_in_flight));
/* A non-zero return from the log's presuspend is only warned about. */
if (log->type->presuspend && log->type->presuspend(log))
/* FIXME: need better error handling */
DMWARN("log presuspend failed");
/*
* Now that recovery is complete/stopped and the
* delayed bios are queued, we need to wait for
* the worker thread to complete. This way,
* we know that all of our I/O has been pushed.
*/
flush_workqueue(ms->kmirrord_wq);
} }
static void mirror_postsuspend(struct dm_target *ti) static void mirror_postsuspend(struct dm_target *ti)
{ {
struct mirror_set *ms = (struct mirror_set *) ti->private; struct mirror_set *ms = ti->private;
struct dirty_log *log = ms->rh.log; struct dirty_log *log = ms->rh.log;
rh_stop_recovery(&ms->rh);
/* Wait for all I/O we generated to complete */
wait_event(_kmirrord_recovery_stopped,
!atomic_read(&ms->rh.recovery_in_flight));
if (log->type->postsuspend && log->type->postsuspend(log)) if (log->type->postsuspend && log->type->postsuspend(log))
/* FIXME: need better error handling */ /* FIXME: need better error handling */
DMWARN("log suspend failed"); DMWARN("log postsuspend failed");
} }
static void mirror_resume(struct dm_target *ti) static void mirror_resume(struct dm_target *ti)
{ {
struct mirror_set *ms = (struct mirror_set *) ti->private; struct mirror_set *ms = ti->private;
struct dirty_log *log = ms->rh.log; struct dirty_log *log = ms->rh.log;
atomic_set(&ms->suspend, 0);
if (log->type->resume && log->type->resume(log)) if (log->type->resume && log->type->resume(log))
/* FIXME: need better error handling */ /* FIXME: need better error handling */
DMWARN("log resume failed"); DMWARN("log resume failed");
rh_start_recovery(&ms->rh); rh_start_recovery(&ms->rh);
} }
/*
 * device_status_char
 * @m: mirror device/leg we want the status of
 *
 * We return one character representing the most severe error
 * we have encountered.
 *    A => Alive - No failures
 *    D => Dead - A write failure occurred leaving mirror out-of-sync
 *    S => Sync - A synchronization failure occurred, mirror out-of-sync
 *    R => Read - A read failure occurred, mirror data unaffected
 *    U => error count is non-zero but none of the error bits is set
 *
 * Returns: <char>
 */
static char device_status_char(struct mirror *m)
{
	if (!atomic_read(&(m->error_count)))
		return 'A';

	/* Checked from most to least severe. */
	if (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type)))
		return 'D';

	if (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type)))
		return 'S';

	if (test_bit(DM_RAID1_READ_ERROR, &(m->error_type)))
		return 'R';

	return 'U';
}
static int mirror_status(struct dm_target *ti, status_type_t type, static int mirror_status(struct dm_target *ti, status_type_t type,
char *result, unsigned int maxlen) char *result, unsigned int maxlen)
{ {
unsigned int m, sz = 0; unsigned int m, sz = 0;
struct mirror_set *ms = (struct mirror_set *) ti->private; struct mirror_set *ms = (struct mirror_set *) ti->private;
struct dirty_log *log = ms->rh.log;
char buffer[ms->nr_mirrors + 1];
switch (type) { switch (type) {
case STATUSTYPE_INFO: case STATUSTYPE_INFO:
DMEMIT("%d ", ms->nr_mirrors); DMEMIT("%d ", ms->nr_mirrors);
for (m = 0; m < ms->nr_mirrors; m++) for (m = 0; m < ms->nr_mirrors; m++) {
DMEMIT("%s ", ms->mirror[m].dev->name); DMEMIT("%s ", ms->mirror[m].dev->name);
buffer[m] = device_status_char(&(ms->mirror[m]));
}
buffer[m] = '\0';
DMEMIT("%llu/%llu 0 ", DMEMIT("%llu/%llu 1 %s ",
(unsigned long long)ms->rh.log->type-> (unsigned long long)log->type->get_sync_count(ms->rh.log),
get_sync_count(ms->rh.log), (unsigned long long)ms->nr_regions, buffer);
(unsigned long long)ms->nr_regions);
sz += ms->rh.log->type->status(ms->rh.log, type, result+sz, maxlen-sz); sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz);
break; break;
case STATUSTYPE_TABLE: case STATUSTYPE_TABLE:
sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen); sz = log->type->status(ms->rh.log, type, result, maxlen);
DMEMIT("%d", ms->nr_mirrors); DMEMIT("%d", ms->nr_mirrors);
for (m = 0; m < ms->nr_mirrors; m++) for (m = 0; m < ms->nr_mirrors; m++)
DMEMIT(" %s %llu", ms->mirror[m].dev->name, DMEMIT(" %s %llu", ms->mirror[m].dev->name,
(unsigned long long)ms->mirror[m].offset); (unsigned long long)ms->mirror[m].offset);
if (ms->features & DM_RAID1_HANDLE_ERRORS) if (ms->features & DM_RAID1_HANDLE_ERRORS)
DMEMIT(" 1 handle_errors"); DMEMIT(" 1 handle_errors");
@ -1360,12 +1847,13 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
static struct target_type mirror_target = { static struct target_type mirror_target = {
.name = "mirror", .name = "mirror",
.version = {1, 0, 3}, .version = {1, 0, 20},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = mirror_ctr, .ctr = mirror_ctr,
.dtr = mirror_dtr, .dtr = mirror_dtr,
.map = mirror_map, .map = mirror_map,
.end_io = mirror_end_io, .end_io = mirror_end_io,
.presuspend = mirror_presuspend,
.postsuspend = mirror_postsuspend, .postsuspend = mirror_postsuspend,
.resume = mirror_resume, .resume = mirror_resume,
.status = mirror_status, .status = mirror_status,

View File

@ -213,11 +213,15 @@ static void unregister_snapshot(struct dm_snapshot *s)
/* /*
* Implementation of the exception hash tables. * Implementation of the exception hash tables.
* The lowest hash_shift bits of the chunk number are ignored, allowing
* some consecutive chunks to be grouped together.
*/ */
static int init_exception_table(struct exception_table *et, uint32_t size) static int init_exception_table(struct exception_table *et, uint32_t size,
unsigned hash_shift)
{ {
unsigned int i; unsigned int i;
et->hash_shift = hash_shift;
et->hash_mask = size - 1; et->hash_mask = size - 1;
et->table = dm_vcalloc(size, sizeof(struct list_head)); et->table = dm_vcalloc(size, sizeof(struct list_head));
if (!et->table) if (!et->table)
@ -248,7 +252,7 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache *
static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) static uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
{ {
return chunk & et->hash_mask; return (chunk >> et->hash_shift) & et->hash_mask;
} }
static void insert_exception(struct exception_table *eh, static void insert_exception(struct exception_table *eh,
@ -275,7 +279,8 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et,
slot = &et->table[exception_hash(et, chunk)]; slot = &et->table[exception_hash(et, chunk)];
list_for_each_entry (e, slot, hash_list) list_for_each_entry (e, slot, hash_list)
if (e->old_chunk == chunk) if (chunk >= e->old_chunk &&
chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
return e; return e;
return NULL; return NULL;
@ -307,6 +312,49 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
mempool_free(pe, pending_pool); mempool_free(pe, pending_pool);
} }
/*
 * Add a completed exception to s->complete.
 *
 * If new_e directly continues an existing entry - both its old_chunk and
 * its new_chunk are adjacent to that entry's run - the existing entry is
 * extended by one chunk and new_e is released with free_exception().
 * Otherwise new_e is linked into the hash bucket selected by its old_chunk.
 */
static void insert_completed_exception(struct dm_snapshot *s,
struct dm_snap_exception *new_e)
{
struct exception_table *eh = &s->complete;
struct list_head *l;
struct dm_snap_exception *e = NULL;
l = &eh->table[exception_hash(eh, new_e->old_chunk)];
/* Add immediately if this table doesn't support consecutive chunks */
if (!eh->hash_shift)
goto out;
/* List is ordered by old_chunk */
list_for_each_entry_reverse(e, l, hash_list) {
/* Insert after an existing chunk? */
if (new_e->old_chunk == (e->old_chunk +
dm_consecutive_chunk_count(e) + 1) &&
new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
dm_consecutive_chunk_count(e) + 1)) {
/* new_e is the chunk right behind e's run: grow the run by one */
dm_consecutive_chunk_count_inc(e);
free_exception(new_e);
return;
}
/* Insert before an existing chunk? */
if (new_e->old_chunk == (e->old_chunk - 1) &&
new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
/* new_e is the chunk right in front of e's run: grow it downwards */
dm_consecutive_chunk_count_inc(e);
e->old_chunk--;
e->new_chunk--;
free_exception(new_e);
return;
}
/* Walking backwards: stop at the first entry with a smaller old_chunk */
if (new_e->old_chunk > e->old_chunk)
break;
}
out:
/*
 * e is still NULL when the loop was skipped, so new_e goes to the head of
 * the bucket; after a break it is linked right behind e.
 * NOTE(review): when the loop runs to completion, e holds whatever
 * list_for_each_entry_reverse() leaves in its cursor - presumably the
 * pseudo-entry wrapping the list head, which would also give a head
 * insert; confirm against list.h.
 */
list_add(&new_e->hash_list, e ? &e->hash_list : l);
}
int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
{ {
struct dm_snap_exception *e; struct dm_snap_exception *e;
@ -316,8 +364,12 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
return -ENOMEM; return -ENOMEM;
e->old_chunk = old; e->old_chunk = old;
/* Consecutive_count is implicitly initialised to zero */
e->new_chunk = new; e->new_chunk = new;
insert_exception(&s->complete, e);
insert_completed_exception(s, e);
return 0; return 0;
} }
@ -333,16 +385,6 @@ static int calc_max_buckets(void)
return mem; return mem;
} }
/*
 * Returns the largest power of 2 that does not exceed n.
 * An input of 0 is returned unchanged.
 */
static uint32_t round_down(uint32_t n)
{
	/* Drop the lowest set bit until at most one bit is left. */
	for (; n & (n - 1); n &= n - 1)
		;

	return n;
}
/* /*
* Allocate room for a suitable hash table. * Allocate room for a suitable hash table.
*/ */
@ -361,9 +403,9 @@ static int init_hash_tables(struct dm_snapshot *s)
hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
hash_size = min(hash_size, max_buckets); hash_size = min(hash_size, max_buckets);
/* Round it down to a power of 2 */ hash_size = rounddown_pow_of_two(hash_size);
hash_size = round_down(hash_size); if (init_exception_table(&s->complete, hash_size,
if (init_exception_table(&s->complete, hash_size)) DM_CHUNK_CONSECUTIVE_BITS))
return -ENOMEM; return -ENOMEM;
/* /*
@ -374,7 +416,7 @@ static int init_hash_tables(struct dm_snapshot *s)
if (hash_size < 64) if (hash_size < 64)
hash_size = 64; hash_size = 64;
if (init_exception_table(&s->pending, hash_size)) { if (init_exception_table(&s->pending, hash_size, 0)) {
exit_exception_table(&s->complete, exception_cache); exit_exception_table(&s->complete, exception_cache);
return -ENOMEM; return -ENOMEM;
} }
@ -733,7 +775,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
* Add a proper exception, and remove the * Add a proper exception, and remove the
* in-flight exception from the list. * in-flight exception from the list.
*/ */
insert_exception(&s->complete, e); insert_completed_exception(s, e);
out: out:
remove_exception(&pe->e); remove_exception(&pe->e);
@ -867,11 +909,12 @@ __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
} }
static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e,
struct bio *bio) struct bio *bio, chunk_t chunk)
{ {
bio->bi_bdev = s->cow->bdev; bio->bi_bdev = s->cow->bdev;
bio->bi_sector = chunk_to_sector(s, e->new_chunk) + bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) +
(bio->bi_sector & s->chunk_mask); (chunk - e->old_chunk)) +
(bio->bi_sector & s->chunk_mask);
} }
static int snapshot_map(struct dm_target *ti, struct bio *bio, static int snapshot_map(struct dm_target *ti, struct bio *bio,
@ -902,7 +945,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
/* If the block is already remapped - use that, else remap it */ /* If the block is already remapped - use that, else remap it */
e = lookup_exception(&s->complete, chunk); e = lookup_exception(&s->complete, chunk);
if (e) { if (e) {
remap_exception(s, e, bio); remap_exception(s, e, bio, chunk);
goto out_unlock; goto out_unlock;
} }
@ -919,7 +962,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
goto out_unlock; goto out_unlock;
} }
remap_exception(s, &pe->e, bio); remap_exception(s, &pe->e, bio, chunk);
bio_list_add(&pe->snapshot_bios, bio); bio_list_add(&pe->snapshot_bios, bio);
r = DM_MAPIO_SUBMITTED; r = DM_MAPIO_SUBMITTED;
@ -1207,7 +1250,7 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result,
static struct target_type origin_target = { static struct target_type origin_target = {
.name = "snapshot-origin", .name = "snapshot-origin",
.version = {1, 5, 0}, .version = {1, 6, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = origin_ctr, .ctr = origin_ctr,
.dtr = origin_dtr, .dtr = origin_dtr,
@ -1218,7 +1261,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = { static struct target_type snapshot_target = {
.name = "snapshot", .name = "snapshot",
.version = {1, 5, 0}, .version = {1, 6, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = snapshot_ctr, .ctr = snapshot_ctr,
.dtr = snapshot_dtr, .dtr = snapshot_dtr,

View File

@ -16,19 +16,22 @@
struct exception_table { struct exception_table {
uint32_t hash_mask; uint32_t hash_mask;
unsigned hash_shift;
struct list_head *table; struct list_head *table;
}; };
/* /*
* The snapshot code deals with largish chunks of the disk at a * The snapshot code deals with largish chunks of the disk at a
* time. Typically 64k - 256k. * time. Typically 32k - 512k.
*/ */
/* FIXME: can we get away with limiting these to a uint32_t ? */
typedef sector_t chunk_t; typedef sector_t chunk_t;
/* /*
* An exception is used where an old chunk of data has been * An exception is used where an old chunk of data has been
* replaced by a new one. * replaced by a new one.
* If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number
* of chunks that follow contiguously. Remaining bits hold the number of the
* chunk within the device.
*/ */
struct dm_snap_exception { struct dm_snap_exception {
struct list_head hash_list; struct list_head hash_list;
@ -37,6 +40,49 @@ struct dm_snap_exception {
chunk_t new_chunk; chunk_t new_chunk;
}; };
/*
* Functions to manipulate consecutive chunks
*/
#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)

/*
 * new_chunk is split into a consecutive-chunk count (upper
 * DM_CHUNK_CONSECUTIVE_BITS bits) and the chunk number itself (lower
 * DM_CHUNK_NUMBER_BITS bits).
 */
#define DM_CHUNK_CONSECUTIVE_BITS 8
#define DM_CHUNK_NUMBER_BITS 56

/* Strip the count bits and return only the chunk number. */
static inline chunk_t dm_chunk_number(chunk_t chunk)
{
	const chunk_t number_mask =
		(chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);

	return chunk & number_mask;
}

/* Number of chunks that follow e's first chunk contiguously. */
static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
{
	unsigned count = e->new_chunk >> DM_CHUNK_NUMBER_BITS;

	return count;
}

/* Extend e's run by one chunk; a count that wraps back to zero is fatal. */
static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
{
	e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);

	BUG_ON(!dm_consecutive_chunk_count(e));
}

#else

/* No spare bits in this configuration: every exception covers one chunk. */
#define DM_CHUNK_CONSECUTIVE_BITS 0

/* The whole value is the chunk number. */
static inline chunk_t dm_chunk_number(chunk_t chunk)
{
	return chunk;
}

/* Runs are never recorded, so nothing ever follows the first chunk. */
static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
{
	return 0;
}

/* Nothing to count in this configuration. */
static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
{
}

#endif
/* /*
* Abstraction to handle the meta/layout of exception stores (the * Abstraction to handle the meta/layout of exception stores (the
* COW device). * COW device).

View File

@ -14,10 +14,13 @@
#include <linux/log2.h> #include <linux/log2.h>
#define DM_MSG_PREFIX "striped" #define DM_MSG_PREFIX "striped"
#define DM_IO_ERROR_THRESHOLD 15
struct stripe { struct stripe {
struct dm_dev *dev; struct dm_dev *dev;
sector_t physical_start; sector_t physical_start;
atomic_t error_count;
}; };
struct stripe_c { struct stripe_c {
@ -30,9 +33,29 @@ struct stripe_c {
uint32_t chunk_shift; uint32_t chunk_shift;
sector_t chunk_mask; sector_t chunk_mask;
/* Needed for handling events */
struct dm_target *ti;
/* Work struct used for triggering events*/
struct work_struct kstriped_ws;
struct stripe stripe[0]; struct stripe stripe[0];
}; };
static struct workqueue_struct *kstriped;
/*
 * Work handler queued when a stripe device reports an I/O error:
 * raises a table event for the target that owns the work item.
 */
static void trigger_event(struct work_struct *work)
{
	struct stripe_c *sc;

	/* The work item is embedded in the stripe context. */
	sc = container_of(work, struct stripe_c, kstriped_ws);

	dm_table_event(sc->ti->table);
}
static inline struct stripe_c *alloc_context(unsigned int stripes) static inline struct stripe_c *alloc_context(unsigned int stripes)
{ {
size_t len; size_t len;
@ -63,6 +86,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
return -ENXIO; return -ENXIO;
sc->stripe[stripe].physical_start = start; sc->stripe[stripe].physical_start = start;
return 0; return 0;
} }
@ -135,6 +159,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM; return -ENOMEM;
} }
INIT_WORK(&sc->kstriped_ws, trigger_event);
/* Set pointer to dm target; used in trigger_event */
sc->ti = ti;
sc->stripes = stripes; sc->stripes = stripes;
sc->stripe_width = width; sc->stripe_width = width;
ti->split_io = chunk_size; ti->split_io = chunk_size;
@ -158,9 +187,11 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
kfree(sc); kfree(sc);
return r; return r;
} }
atomic_set(&(sc->stripe[i].error_count), 0);
} }
ti->private = sc; ti->private = sc;
return 0; return 0;
} }
@ -172,6 +203,7 @@ static void stripe_dtr(struct dm_target *ti)
for (i = 0; i < sc->stripes; i++) for (i = 0; i < sc->stripes; i++)
dm_put_device(ti, sc->stripe[i].dev); dm_put_device(ti, sc->stripe[i].dev);
flush_workqueue(kstriped);
kfree(sc); kfree(sc);
} }
@ -190,16 +222,37 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
return DM_MAPIO_REMAPPED; return DM_MAPIO_REMAPPED;
} }
/*
* Stripe status:
*
* INFO
* #stripes [stripe_name <stripe_name>] [group word count]
* [error count 'A|D' <error count 'A|D'>]
*
* TABLE
* #stripes [stripe chunk size]
* [stripe_name physical_start <stripe_name physical_start>]
*
*/
static int stripe_status(struct dm_target *ti, static int stripe_status(struct dm_target *ti,
status_type_t type, char *result, unsigned int maxlen) status_type_t type, char *result, unsigned int maxlen)
{ {
struct stripe_c *sc = (struct stripe_c *) ti->private; struct stripe_c *sc = (struct stripe_c *) ti->private;
char buffer[sc->stripes + 1];
unsigned int sz = 0; unsigned int sz = 0;
unsigned int i; unsigned int i;
switch (type) { switch (type) {
case STATUSTYPE_INFO: case STATUSTYPE_INFO:
result[0] = '\0'; DMEMIT("%d ", sc->stripes);
for (i = 0; i < sc->stripes; i++) {
DMEMIT("%s ", sc->stripe[i].dev->name);
buffer[i] = atomic_read(&(sc->stripe[i].error_count)) ?
'D' : 'A';
}
buffer[i] = '\0';
DMEMIT("1 %s", buffer);
break; break;
case STATUSTYPE_TABLE: case STATUSTYPE_TABLE:
@ -213,13 +266,52 @@ static int stripe_status(struct dm_target *ti,
return 0; return 0;
} }
/*
 * End-of-I/O hook for the striped target.
 *
 * A successful completion returns 0.  -EWOULDBLOCK on a read-ahead bio and
 * -EOPNOTSUPP are returned without touching the error counters.  For any
 * other error the failing device is matched, by its "major:minor" name,
 * against every stripe; each matching stripe has its error count
 * incremented and, while that count is below DM_IO_ERROR_THRESHOLD, an
 * event is queued on the kstriped workqueue.
 *
 * Returns 0 on success, otherwise the error code unchanged.
 */
static int stripe_end_io(struct dm_target *ti, struct bio *bio,
			 int error, union map_info *map_context)
{
	unsigned i;
	char major_minor[16];
	struct stripe_c *sc = ti->private;

	if (!error)
		return 0; /* I/O complete */

	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
		return error;

	if (error == -EOPNOTSUPP)
		return error;

	/*
	 * Two "%d" fields plus the separator can need more room than this
	 * buffer offers, so format with an explicit bound.  snprintf()
	 * always NUL-terminates, which makes pre-zeroing the buffer
	 * unnecessary.
	 */
	snprintf(major_minor, sizeof(major_minor), "%d:%d",
		 bio->bi_bdev->bd_disk->major,
		 bio->bi_bdev->bd_disk->first_minor);

	/*
	 * Test to see which stripe drive triggered the event
	 * and increment error count for all stripes on that device.
	 * If the error count for a given device exceeds the threshold
	 * value we will no longer trigger any further events.
	 */
	for (i = 0; i < sc->stripes; i++) {
		if (strcmp(sc->stripe[i].dev->name, major_minor))
			continue;

		atomic_inc(&(sc->stripe[i].error_count));
		if (atomic_read(&(sc->stripe[i].error_count)) <
		    DM_IO_ERROR_THRESHOLD)
			queue_work(kstriped, &sc->kstriped_ws);
	}

	return error;
}
static struct target_type stripe_target = { static struct target_type stripe_target = {
.name = "striped", .name = "striped",
.version= {1, 0, 2}, .version = {1, 1, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = stripe_ctr, .ctr = stripe_ctr,
.dtr = stripe_dtr, .dtr = stripe_dtr,
.map = stripe_map, .map = stripe_map,
.end_io = stripe_end_io,
.status = stripe_status, .status = stripe_status,
}; };
@ -231,6 +323,13 @@ int __init dm_stripe_init(void)
if (r < 0) if (r < 0)
DMWARN("target registration failed"); DMWARN("target registration failed");
kstriped = create_singlethread_workqueue("kstriped");
if (!kstriped) {
DMERR("failed to create workqueue kstriped");
dm_unregister_target(&stripe_target);
return -ENOMEM;
}
return r; return r;
} }
@ -239,5 +338,7 @@ void dm_stripe_exit(void)
if (dm_unregister_target(&stripe_target)) if (dm_unregister_target(&stripe_target))
DMWARN("target unregistration failed"); DMWARN("target unregistration failed");
destroy_workqueue(kstriped);
return; return;
} }

View File

@ -287,9 +287,8 @@ static void free_devices(struct list_head *devices)
{ {
struct list_head *tmp, *next; struct list_head *tmp, *next;
for (tmp = devices->next; tmp != devices; tmp = next) { list_for_each_safe(tmp, next, devices) {
struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
next = tmp->next;
kfree(dd); kfree(dd);
} }
} }
@ -476,7 +475,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
int mode, struct dm_dev **result) int mode, struct dm_dev **result)
{ {
int r; int r;
dev_t dev; dev_t uninitialized_var(dev);
struct dm_dev *dd; struct dm_dev *dd;
unsigned int major, minor; unsigned int major, minor;
@ -805,7 +804,7 @@ static int setup_indexes(struct dm_table *t)
return -ENOMEM; return -ENOMEM;
/* set up internal nodes, bottom-up */ /* set up internal nodes, bottom-up */
for (i = t->depth - 2, total = 0; i >= 0; i--) { for (i = t->depth - 2; i >= 0; i--) {
t->index[i] = indexes; t->index[i] = indexes;
indexes += (KEYS_PER_NODE * t->counts[i]); indexes += (KEYS_PER_NODE * t->counts[i]);
setup_btree_index(i, t); setup_btree_index(i, t);
@ -993,12 +992,11 @@ int dm_table_resume_targets(struct dm_table *t)
int dm_table_any_congested(struct dm_table *t, int bdi_bits) int dm_table_any_congested(struct dm_table *t, int bdi_bits)
{ {
struct list_head *d, *devices; struct dm_dev *dd;
struct list_head *devices = dm_table_get_devices(t);
int r = 0; int r = 0;
devices = dm_table_get_devices(t); list_for_each_entry(dd, devices, list) {
for (d = devices->next; d != devices; d = d->next) {
struct dm_dev *dd = list_entry(d, struct dm_dev, list);
struct request_queue *q = bdev_get_queue(dd->bdev); struct request_queue *q = bdev_get_queue(dd->bdev);
r |= bdi_congested(&q->backing_dev_info, bdi_bits); r |= bdi_congested(&q->backing_dev_info, bdi_bits);
} }
@ -1008,10 +1006,10 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
void dm_table_unplug_all(struct dm_table *t) void dm_table_unplug_all(struct dm_table *t)
{ {
struct list_head *d, *devices = dm_table_get_devices(t); struct dm_dev *dd;
struct list_head *devices = dm_table_get_devices(t);
for (d = devices->next; d != devices; d = d->next) { list_for_each_entry(dd, devices, list) {
struct dm_dev *dd = list_entry(d, struct dm_dev, list);
struct request_queue *q = bdev_get_queue(dd->bdev); struct request_queue *q = bdev_get_queue(dd->bdev);
blk_unplug(q); blk_unplug(q);

View File

@ -71,9 +71,22 @@ union map_info *dm_get_mapinfo(struct bio *bio)
#define DMF_DELETING 4 #define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5 #define DMF_NOFLUSH_SUSPENDING 5
/*
 * Work processed by per-device workqueue.
 *
 * One request describes a single flush job; dm_wq_work() recovers the
 * request from the embedded work item and acts on its type.
 */
struct dm_wq_req {
/* Which flush dm_wq_work() performs for this request */
enum {
DM_WQ_FLUSH_ALL, /* merge the pushback list first, then flush deferred bios */
DM_WQ_FLUSH_DEFERRED, /* flush the deferred bios only */
} type;
struct work_struct work; /* work item queued on md->wq */
struct mapped_device *md; /* device the flush operates on */
void *context; /* stored by dm_wq_queue(); NOTE(review): no reader visible here */
};
struct mapped_device { struct mapped_device {
struct rw_semaphore io_lock; struct rw_semaphore io_lock;
struct semaphore suspend_lock; struct mutex suspend_lock;
spinlock_t pushback_lock; spinlock_t pushback_lock;
rwlock_t map_lock; rwlock_t map_lock;
atomic_t holders; atomic_t holders;
@ -95,6 +108,11 @@ struct mapped_device {
struct bio_list deferred; struct bio_list deferred;
struct bio_list pushback; struct bio_list pushback;
/*
* Processing queue (flush/barriers)
*/
struct workqueue_struct *wq;
/* /*
* The current mapping. * The current mapping.
*/ */
@ -181,7 +199,7 @@ static void local_exit(void)
DMINFO("cleaned up"); DMINFO("cleaned up");
} }
int (*_inits[])(void) __initdata = { static int (*_inits[])(void) __initdata = {
local_init, local_init,
dm_target_init, dm_target_init,
dm_linear_init, dm_linear_init,
@ -189,7 +207,7 @@ int (*_inits[])(void) __initdata = {
dm_interface_init, dm_interface_init,
}; };
void (*_exits[])(void) = { static void (*_exits[])(void) = {
local_exit, local_exit,
dm_target_exit, dm_target_exit,
dm_linear_exit, dm_linear_exit,
@ -982,7 +1000,7 @@ static struct mapped_device *alloc_dev(int minor)
} }
if (!try_module_get(THIS_MODULE)) if (!try_module_get(THIS_MODULE))
goto bad0; goto bad_module_get;
/* get a minor number for the dev */ /* get a minor number for the dev */
if (minor == DM_ANY_MINOR) if (minor == DM_ANY_MINOR)
@ -990,11 +1008,11 @@ static struct mapped_device *alloc_dev(int minor)
else else
r = specific_minor(md, minor); r = specific_minor(md, minor);
if (r < 0) if (r < 0)
goto bad1; goto bad_minor;
memset(md, 0, sizeof(*md)); memset(md, 0, sizeof(*md));
init_rwsem(&md->io_lock); init_rwsem(&md->io_lock);
init_MUTEX(&md->suspend_lock); mutex_init(&md->suspend_lock);
spin_lock_init(&md->pushback_lock); spin_lock_init(&md->pushback_lock);
rwlock_init(&md->map_lock); rwlock_init(&md->map_lock);
atomic_set(&md->holders, 1); atomic_set(&md->holders, 1);
@ -1006,7 +1024,7 @@ static struct mapped_device *alloc_dev(int minor)
md->queue = blk_alloc_queue(GFP_KERNEL); md->queue = blk_alloc_queue(GFP_KERNEL);
if (!md->queue) if (!md->queue)
goto bad1_free_minor; goto bad_queue;
md->queue->queuedata = md; md->queue->queuedata = md;
md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_fn = dm_any_congested;
@ -1017,11 +1035,11 @@ static struct mapped_device *alloc_dev(int minor)
md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
if (!md->io_pool) if (!md->io_pool)
goto bad2; goto bad_io_pool;
md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
if (!md->tio_pool) if (!md->tio_pool)
goto bad3; goto bad_tio_pool;
md->bs = bioset_create(16, 16); md->bs = bioset_create(16, 16);
if (!md->bs) if (!md->bs)
@ -1029,7 +1047,7 @@ static struct mapped_device *alloc_dev(int minor)
md->disk = alloc_disk(1); md->disk = alloc_disk(1);
if (!md->disk) if (!md->disk)
goto bad4; goto bad_disk;
atomic_set(&md->pending, 0); atomic_set(&md->pending, 0);
init_waitqueue_head(&md->wait); init_waitqueue_head(&md->wait);
@ -1044,6 +1062,10 @@ static struct mapped_device *alloc_dev(int minor)
add_disk(md->disk); add_disk(md->disk);
format_dev_t(md->name, MKDEV(_major, minor)); format_dev_t(md->name, MKDEV(_major, minor));
md->wq = create_singlethread_workqueue("kdmflush");
if (!md->wq)
goto bad_thread;
/* Populate the mapping, nobody knows we exist yet */ /* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock); spin_lock(&_minor_lock);
old_md = idr_replace(&_minor_idr, md, minor); old_md = idr_replace(&_minor_idr, md, minor);
@ -1053,19 +1075,21 @@ static struct mapped_device *alloc_dev(int minor)
return md; return md;
bad4: bad_thread:
put_disk(md->disk);
bad_disk:
bioset_free(md->bs); bioset_free(md->bs);
bad_no_bioset: bad_no_bioset:
mempool_destroy(md->tio_pool); mempool_destroy(md->tio_pool);
bad3: bad_tio_pool:
mempool_destroy(md->io_pool); mempool_destroy(md->io_pool);
bad2: bad_io_pool:
blk_cleanup_queue(md->queue); blk_cleanup_queue(md->queue);
bad1_free_minor: bad_queue:
free_minor(minor); free_minor(minor);
bad1: bad_minor:
module_put(THIS_MODULE); module_put(THIS_MODULE);
bad0: bad_module_get:
kfree(md); kfree(md);
return NULL; return NULL;
} }
@ -1080,6 +1104,7 @@ static void free_dev(struct mapped_device *md)
unlock_fs(md); unlock_fs(md);
bdput(md->suspended_bdev); bdput(md->suspended_bdev);
} }
destroy_workqueue(md->wq);
mempool_destroy(md->tio_pool); mempool_destroy(md->tio_pool);
mempool_destroy(md->io_pool); mempool_destroy(md->io_pool);
bioset_free(md->bs); bioset_free(md->bs);
@ -1259,20 +1284,91 @@ void dm_put(struct mapped_device *md)
} }
EXPORT_SYMBOL_GPL(dm_put); EXPORT_SYMBOL_GPL(dm_put);
/*
 * Wait until md->pending reaches zero or a signal is pending for the
 * calling task.
 *
 * Returns 0 once no I/O is pending, or -EINTR if a signal was seen first.
 * The task state is always TASK_RUNNING on return.
 */
static int dm_wait_for_completion(struct mapped_device *md)
{
int r = 0;
while (1) {
/* Change state before testing the condition, then re-test each pass */
set_current_state(TASK_INTERRUPTIBLE);
/*
 * NOTE(review): presumably orders the state change against the
 * md->pending read so a wake-up between the two is not lost - confirm.
 */
smp_mb();
if (!atomic_read(&md->pending))
break;
if (signal_pending(current)) {
r = -EINTR;
break;
}
/* Give up the CPU until woken; NOTE(review): waker not visible here */
io_schedule();
}
set_current_state(TASK_RUNNING);
return r;
}
/* /*
* Process the deferred bios * Process the deferred bios
*/ */
static void __flush_deferred_io(struct mapped_device *md, struct bio *c) static void __flush_deferred_io(struct mapped_device *md)
{ {
struct bio *n; struct bio *c;
while (c) { while ((c = bio_list_pop(&md->deferred))) {
n = c->bi_next;
c->bi_next = NULL;
if (__split_bio(md, c)) if (__split_bio(md, c))
bio_io_error(c); bio_io_error(c);
c = n;
} }
clear_bit(DMF_BLOCK_IO, &md->flags);
}
/*
 * Under pushback_lock: clear DMF_NOFLUSH_SUSPENDING, move every bio on the
 * pushback list to the front of the deferred list, and leave the pushback
 * list empty.
 */
static void __merge_pushback_list(struct mapped_device *md)
{
	unsigned long irq_state;

	spin_lock_irqsave(&md->pushback_lock, irq_state);

	clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* Pushed-back bios are placed ahead of the bios already deferred. */
	bio_list_merge_head(&md->deferred, &md->pushback);
	bio_list_init(&md->pushback);

	spin_unlock_irqrestore(&md->pushback_lock, irq_state);
}
/*
 * Workqueue handler for a struct dm_wq_req.
 *
 * Holds md->io_lock for writing while it runs.  DM_WQ_FLUSH_ALL merges the
 * pushback list and then flushes the deferred bios; DM_WQ_FLUSH_DEFERRED
 * only flushes the deferred bios; any other type is reported and hits
 * BUG().
 */
static void dm_wq_work(struct work_struct *work)
{
	struct dm_wq_req *req;
	struct mapped_device *md;

	req = container_of(work, struct dm_wq_req, work);
	md = req->md;

	down_write(&md->io_lock);

	if (req->type == DM_WQ_FLUSH_ALL) {
		/* Pushed-back bios join the deferred list before the flush. */
		__merge_pushback_list(md);
		__flush_deferred_io(md);
	} else if (req->type == DM_WQ_FLUSH_DEFERRED) {
		__flush_deferred_io(md);
	} else {
		DMERR("dm_wq_work: unrecognised work type %d", req->type);
		BUG();
	}

	up_write(&md->io_lock);
}
/*
 * Fill in a caller-supplied request and queue it on the device's
 * workqueue, where dm_wq_work() will handle it.
 */
static void dm_wq_queue(struct mapped_device *md, int type, void *context,
			struct dm_wq_req *req)
{
	/* Describe the job ... */
	req->md = md;
	req->type = type;
	req->context = context;

	/* ... then hand it to the per-device workqueue. */
	INIT_WORK(&req->work, dm_wq_work);
	queue_work(md->wq, &req->work);
}
static void dm_queue_flush(struct mapped_device *md, int type, void *context)
{
struct dm_wq_req req;
dm_wq_queue(md, type, context, &req);
flush_workqueue(md->wq);
} }
/* /*
@ -1282,7 +1378,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{ {
int r = -EINVAL; int r = -EINVAL;
down(&md->suspend_lock); mutex_lock(&md->suspend_lock);
/* device must be suspended */ /* device must be suspended */
if (!dm_suspended(md)) if (!dm_suspended(md))
@ -1297,7 +1393,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
r = __bind(md, table); r = __bind(md, table);
out: out:
up(&md->suspend_lock); mutex_unlock(&md->suspend_lock);
return r; return r;
} }
@ -1346,17 +1442,17 @@ static void unlock_fs(struct mapped_device *md)
int dm_suspend(struct mapped_device *md, unsigned suspend_flags) int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{ {
struct dm_table *map = NULL; struct dm_table *map = NULL;
unsigned long flags;
DECLARE_WAITQUEUE(wait, current); DECLARE_WAITQUEUE(wait, current);
struct bio *def; int r = 0;
int r = -EINVAL;
int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
down(&md->suspend_lock); mutex_lock(&md->suspend_lock);
if (dm_suspended(md)) if (dm_suspended(md)) {
r = -EINVAL;
goto out_unlock; goto out_unlock;
}
map = dm_get_table(md); map = dm_get_table(md);
@ -1378,16 +1474,16 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
r = -ENOMEM; r = -ENOMEM;
goto flush_and_out; goto flush_and_out;
} }
}
/* /*
* Flush I/O to the device. * Flush I/O to the device. noflush supersedes do_lockfs,
* noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. * because lock_fs() needs to flush I/Os.
*/ */
if (do_lockfs && !noflush) { if (do_lockfs) {
r = lock_fs(md); r = lock_fs(md);
if (r) if (r)
goto out; goto out;
}
} }
/* /*
@ -1404,66 +1500,36 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
dm_table_unplug_all(map); dm_table_unplug_all(map);
/* /*
* Then we wait for the already mapped ios to * Wait for the already-mapped ios to complete.
* complete.
*/ */
while (1) { r = dm_wait_for_completion(md);
set_current_state(TASK_INTERRUPTIBLE);
if (!atomic_read(&md->pending) || signal_pending(current))
break;
io_schedule();
}
set_current_state(TASK_RUNNING);
down_write(&md->io_lock); down_write(&md->io_lock);
remove_wait_queue(&md->wait, &wait); remove_wait_queue(&md->wait, &wait);
if (noflush) { if (noflush)
spin_lock_irqsave(&md->pushback_lock, flags); __merge_pushback_list(md);
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); up_write(&md->io_lock);
bio_list_merge_head(&md->deferred, &md->pushback);
bio_list_init(&md->pushback);
spin_unlock_irqrestore(&md->pushback_lock, flags);
}
/* were we interrupted ? */ /* were we interrupted ? */
r = -EINTR; if (r < 0) {
if (atomic_read(&md->pending)) { dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
clear_bit(DMF_BLOCK_IO, &md->flags);
def = bio_list_get(&md->deferred);
__flush_deferred_io(md, def);
up_write(&md->io_lock);
unlock_fs(md); unlock_fs(md);
goto out; /* pushback list is already flushed, so skip flush */ goto out; /* pushback list is already flushed, so skip flush */
} }
up_write(&md->io_lock);
dm_table_postsuspend_targets(map); dm_table_postsuspend_targets(map);
set_bit(DMF_SUSPENDED, &md->flags); set_bit(DMF_SUSPENDED, &md->flags);
r = 0;
flush_and_out: flush_and_out:
if (r && noflush) { if (r && noflush)
/* /*
* Because there may be already I/Os in the pushback list, * Because there may be already I/Os in the pushback list,
* flush them before return. * flush them before return.
*/ */
down_write(&md->io_lock); dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL);
spin_lock_irqsave(&md->pushback_lock, flags);
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
bio_list_merge_head(&md->deferred, &md->pushback);
bio_list_init(&md->pushback);
spin_unlock_irqrestore(&md->pushback_lock, flags);
def = bio_list_get(&md->deferred);
__flush_deferred_io(md, def);
up_write(&md->io_lock);
}
out: out:
if (r && md->suspended_bdev) { if (r && md->suspended_bdev) {
@ -1474,17 +1540,16 @@ out:
dm_table_put(map); dm_table_put(map);
out_unlock: out_unlock:
up(&md->suspend_lock); mutex_unlock(&md->suspend_lock);
return r; return r;
} }
int dm_resume(struct mapped_device *md) int dm_resume(struct mapped_device *md)
{ {
int r = -EINVAL; int r = -EINVAL;
struct bio *def;
struct dm_table *map = NULL; struct dm_table *map = NULL;
down(&md->suspend_lock); mutex_lock(&md->suspend_lock);
if (!dm_suspended(md)) if (!dm_suspended(md))
goto out; goto out;
@ -1496,12 +1561,7 @@ int dm_resume(struct mapped_device *md)
if (r) if (r)
goto out; goto out;
down_write(&md->io_lock); dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
clear_bit(DMF_BLOCK_IO, &md->flags);
def = bio_list_get(&md->deferred);
__flush_deferred_io(md, def);
up_write(&md->io_lock);
unlock_fs(md); unlock_fs(md);
@ -1520,7 +1580,7 @@ int dm_resume(struct mapped_device *md)
out: out:
dm_table_put(map); dm_table_put(map);
up(&md->suspend_lock); mutex_unlock(&md->suspend_lock);
return r; return r;
} }

View File

@ -78,7 +78,6 @@
#include <linux/mii.h> #include <linux/mii.h>
#include <linux/if_bonding.h> #include <linux/if_bonding.h>
#include <linux/watchdog.h> #include <linux/watchdog.h>
#include <linux/dm-ioctl.h>
#include <linux/soundcard.h> #include <linux/soundcard.h>
#include <linux/lp.h> #include <linux/lp.h>
@ -1993,39 +1992,6 @@ COMPATIBLE_IOCTL(STOP_ARRAY_RO)
COMPATIBLE_IOCTL(RESTART_ARRAY_RW) COMPATIBLE_IOCTL(RESTART_ARRAY_RW)
COMPATIBLE_IOCTL(GET_BITMAP_FILE) COMPATIBLE_IOCTL(GET_BITMAP_FILE)
ULONG_IOCTL(SET_BITMAP_FILE) ULONG_IOCTL(SET_BITMAP_FILE)
/* DM */
COMPATIBLE_IOCTL(DM_VERSION_32)
COMPATIBLE_IOCTL(DM_REMOVE_ALL_32)
COMPATIBLE_IOCTL(DM_LIST_DEVICES_32)
COMPATIBLE_IOCTL(DM_DEV_CREATE_32)
COMPATIBLE_IOCTL(DM_DEV_REMOVE_32)
COMPATIBLE_IOCTL(DM_DEV_RENAME_32)
COMPATIBLE_IOCTL(DM_DEV_SUSPEND_32)
COMPATIBLE_IOCTL(DM_DEV_STATUS_32)
COMPATIBLE_IOCTL(DM_DEV_WAIT_32)
COMPATIBLE_IOCTL(DM_TABLE_LOAD_32)
COMPATIBLE_IOCTL(DM_TABLE_CLEAR_32)
COMPATIBLE_IOCTL(DM_TABLE_DEPS_32)
COMPATIBLE_IOCTL(DM_TABLE_STATUS_32)
COMPATIBLE_IOCTL(DM_LIST_VERSIONS_32)
COMPATIBLE_IOCTL(DM_TARGET_MSG_32)
COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY_32)
COMPATIBLE_IOCTL(DM_VERSION)
COMPATIBLE_IOCTL(DM_REMOVE_ALL)
COMPATIBLE_IOCTL(DM_LIST_DEVICES)
COMPATIBLE_IOCTL(DM_DEV_CREATE)
COMPATIBLE_IOCTL(DM_DEV_REMOVE)
COMPATIBLE_IOCTL(DM_DEV_RENAME)
COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
COMPATIBLE_IOCTL(DM_DEV_STATUS)
COMPATIBLE_IOCTL(DM_DEV_WAIT)
COMPATIBLE_IOCTL(DM_TABLE_LOAD)
COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
COMPATIBLE_IOCTL(DM_TABLE_DEPS)
COMPATIBLE_IOCTL(DM_TABLE_STATUS)
COMPATIBLE_IOCTL(DM_LIST_VERSIONS)
COMPATIBLE_IOCTL(DM_TARGET_MSG)
COMPATIBLE_IOCTL(DM_DEV_SET_GEOMETRY)
/* Big K */ /* Big K */
COMPATIBLE_IOCTL(PIO_FONT) COMPATIBLE_IOCTL(PIO_FONT)
COMPATIBLE_IOCTL(GIO_FONT) COMPATIBLE_IOCTL(GIO_FONT)

View File

@ -110,15 +110,15 @@ struct target_type {
}; };
struct io_restrictions { struct io_restrictions {
unsigned int max_sectors; unsigned long bounce_pfn;
unsigned short max_phys_segments; unsigned long seg_boundary_mask;
unsigned short max_hw_segments; unsigned max_hw_sectors;
unsigned short hardsect_size; unsigned max_sectors;
unsigned int max_segment_size; unsigned max_segment_size;
unsigned int max_hw_sectors; unsigned short hardsect_size;
unsigned long seg_boundary_mask; unsigned short max_hw_segments;
unsigned long bounce_pfn; unsigned short max_phys_segments;
unsigned char no_cluster; /* inverted so that 0 is default */ unsigned char no_cluster; /* inverted so that 0 is default */
}; };
struct dm_target { struct dm_target {

View File

@ -232,36 +232,6 @@ enum {
DM_DEV_SET_GEOMETRY_CMD DM_DEV_SET_GEOMETRY_CMD
}; };
/*
* The dm_ioctl struct passed into the ioctl is just the header
* on a larger chunk of memory. On x86-64 and other
* architectures the dm-ioctl struct will be padded to an 8 byte
* boundary so the size will be different, which would change the
* ioctl code - yes I really messed up. This hack forces these
* architectures to have the correct ioctl code.
*/
#ifdef CONFIG_COMPAT
typedef char ioctl_struct[308];
#define DM_VERSION_32 _IOWR(DM_IOCTL, DM_VERSION_CMD, ioctl_struct)
#define DM_REMOVE_ALL_32 _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, ioctl_struct)
#define DM_LIST_DEVICES_32 _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, ioctl_struct)
#define DM_DEV_CREATE_32 _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, ioctl_struct)
#define DM_DEV_REMOVE_32 _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, ioctl_struct)
#define DM_DEV_RENAME_32 _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, ioctl_struct)
#define DM_DEV_SUSPEND_32 _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, ioctl_struct)
#define DM_DEV_STATUS_32 _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, ioctl_struct)
#define DM_DEV_WAIT_32 _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, ioctl_struct)
#define DM_TABLE_LOAD_32 _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, ioctl_struct)
#define DM_TABLE_CLEAR_32 _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, ioctl_struct)
#define DM_TABLE_DEPS_32 _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, ioctl_struct)
#define DM_TABLE_STATUS_32 _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, ioctl_struct)
#define DM_LIST_VERSIONS_32 _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, ioctl_struct)
#define DM_TARGET_MSG_32 _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, ioctl_struct)
#define DM_DEV_SET_GEOMETRY_32 _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, ioctl_struct)
#endif
#define DM_IOCTL 0xfd #define DM_IOCTL 0xfd
#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) #define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
@ -286,9 +256,9 @@ typedef char ioctl_struct[308];
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4 #define DM_VERSION_MAJOR 4
#define DM_VERSION_MINOR 12 #define DM_VERSION_MINOR 13
#define DM_VERSION_PATCHLEVEL 0 #define DM_VERSION_PATCHLEVEL 0
#define DM_VERSION_EXTRA "-ioctl (2007-10-02)" #define DM_VERSION_EXTRA "-ioctl (2007-10-18)"
/* Status bits */ /* Status bits */
#define DM_READONLY_FLAG (1 << 0) /* In/Out */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */