From f36afb3957353d2529cb2b00f78fdccd14fc5e9c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 31 Oct 2013 13:55:45 -0400 Subject: [PATCH 01/32] dm: allocate buffer for messages with small number of arguments using GFP_NOIO dm-mpath and dm-thin must process messages even if some device is suspended, so we allocate argv buffer with GFP_NOIO. These messages have a small fixed number of arguments. On the other hand, dm-switch needs to process bulk data using messages so excessive use of GFP_NOIO could cause trouble. The patch also lowers the default number of arguments from 64 to 8, so that there is smaller load on GFP_NOIO allocations. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Acked-by: Alasdair G Kergon Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8f8783533ac7..41d907b58f7e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -545,14 +545,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti) /* * Used to dynamically allocate the arg array. + * + * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must + * process messages even if some device is suspended. These messages have a + * small fixed number of arguments. + * + * On the other hand, dm-switch needs to process bulk data using messages and + * excessive use of GFP_NOIO could cause trouble. */ static char **realloc_argv(unsigned *array_size, char **old_argv) { char **argv; unsigned new_size; + gfp_t gfp; - new_size = *array_size ? 
*array_size * 2 : 64; - argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (*array_size) { + new_size = *array_size * 2; + gfp = GFP_KERNEL; + } else { + new_size = 8; + gfp = GFP_NOIO; + } + argv = kmalloc(new_size * sizeof(*argv), gfp); if (argv) { memcpy(argv, old_argv, *array_size * sizeof(*argv)); *array_size = new_size; From 954a73d5d3073df2231820c718fdd2f18b0fe4c9 Mon Sep 17 00:00:00 2001 From: Shiva Krishna Merla Date: Wed, 30 Oct 2013 03:26:38 +0000 Subject: [PATCH 02/32] dm mpath: fix race condition between multipath_dtr and pg_init_done Whenever multipath_dtr() is happening we must prevent queueing any further path activation work. Implement this by adding a new 'pg_init_disabled' flag to the multipath structure that denotes future path activation work should be skipped if it is set. By disabling pg_init and then re-enabling in flush_multipath_work() we also avoid the potential for pg_init to be initiated while suspending an mpath device. Without this patch a race condition exists that may result in a kernel panic: 1) If after pg_init_done() decrements pg_init_in_progress to 0, a call to wait_for_pg_init_completion() assumes there are no more pending path management commands. 2) If pg_init_required is set by pg_init_done(), due to retryable mode_select errors, then process_queued_ios() will again queue the path activation work. 3) If free_multipath() completes before activate_path() work is called a NULL pointer dereference like the following can be seen when accessing members of the recently destructed multipath: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 RIP: 0010:[] [] activate_path+0x1b/0x30 [dm_multipath] [] worker_thread+0x170/0x2a0 [] ? 
autoremove_wake_function+0x0/0x40 [switch to disabling pg_init in flush_multipath_work & header edits by Mike Snitzer] Signed-off-by: Shiva Krishna Merla Reviewed-by: Krishnasamy Somasundaram Tested-by: Speagle Andy Acked-by: Junichi Nomura Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-mpath.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index de570a558764..799e479db93b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -87,6 +87,7 @@ struct multipath { unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ + unsigned pg_init_disabled:1; /* pg_init is not currently allowed */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ @@ -497,7 +498,8 @@ static void process_queued_ios(struct work_struct *work) (!pgpath && !m->queue_if_no_path)) must_queue = 0; - if (m->pg_init_required && !m->pg_init_in_progress && pgpath) + if (m->pg_init_required && !m->pg_init_in_progress && pgpath && + !m->pg_init_disabled) __pg_init_all_paths(m); spin_unlock_irqrestore(&m->lock, flags); @@ -942,10 +944,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + m->pg_init_disabled = 1; + spin_unlock_irqrestore(&m->lock, flags); + flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); flush_work(&m->trigger_event); + + spin_lock_irqsave(&m->lock, flags); + m->pg_init_disabled = 0; + spin_unlock_irqrestore(&m->lock, flags); } static void multipath_dtr(struct dm_target *ti) @@ -1164,7 +1176,7 @@ static int 
pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (m->pg_init_count <= m->pg_init_retries) + if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) m->pg_init_required = 1; else limit_reached = 1; @@ -1714,7 +1726,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 5, 1}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, From b63349a7a53d34ffde70cb4feec48ea9e6f5e97b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 1 Oct 2013 11:49:56 +0200 Subject: [PATCH 03/32] dm mpath: requeue I/O during pg_init When pg_init is running no I/O can be submitted to the underlying devices, as the path priority etc might change. When using queue_io for this, requests will be piling up within multipath as the block I/O scheduler just sees a _very fast_ device. All of this queued I/O has to be resubmitted from within multipathing once pg_init is done. This approach has the problem that it's virtually impossible to abort I/O when pg_init is running, and we're adding heavy load to the devices after pg_init since all of the queued I/O needs to be resubmitted _before_ any requests can be pulled off of the request queue and normal operation continues. This patch will requeue the I/O that triggers the pg_init call, and return 'busy' when pg_init is in progress. With these changes the block I/O scheduler will stop submitting I/O during pg_init, resulting in a quicker path switch and less I/O pressure (and memory consumption) after pg_init. 
Signed-off-by: Hannes Reinecke [patch header edited for clarity and typos by Mike Snitzer] Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 799e479db93b..6eb9dc9ef8f3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -391,13 +391,16 @@ static int map_io(struct multipath *m, struct request *clone, if (was_queued) m->queue_size--; - if ((pgpath && m->queue_io) || - (!pgpath && m->queue_if_no_path)) { + if (m->pg_init_required) { + if (!m->pg_init_in_progress) + queue_work(kmultipathd, &m->process_queued_ios); + r = DM_MAPIO_REQUEUE; + } else if ((pgpath && m->queue_io) || + (!pgpath && m->queue_if_no_path)) { /* Queue for the daemon to resubmit */ list_add_tail(&clone->queuelist, &m->queued_ios); m->queue_size++; - if ((m->pg_init_required && !m->pg_init_in_progress) || - !m->queue_io) + if (!m->queue_io) queue_work(kmultipathd, &m->process_queued_ios); pgpath = NULL; r = DM_MAPIO_SUBMITTED; @@ -1677,6 +1680,11 @@ static int multipath_busy(struct dm_target *ti) spin_lock_irqsave(&m->lock, flags); + /* pg_init in progress, requeue until done */ + if (m->pg_init_in_progress) { + busy = 1; + goto out; + } /* Guess which priority_group will be used at next mapping time */ if (unlikely(!m->current_pgpath && m->next_pg)) pg = m->next_pg; From 9c1d4de56066e4d6abc66ec188faafd7b303fb08 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 30 Oct 2013 11:19:59 +0000 Subject: [PATCH 04/32] dm array: fix bug in growing array Entries would be lost if the old tail block was partially filled. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.9+ --- drivers/md/persistent-data/dm-array.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 172147eb1d40..af96e24ec328 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -509,15 +509,18 @@ static int grow_add_tail_block(struct resize *resize) static int grow_needs_more_blocks(struct resize *resize) { int r; + unsigned old_nr_blocks = resize->old_nr_full_blocks; if (resize->old_nr_entries_in_last_block > 0) { + old_nr_blocks++; + r = grow_extend_tail_block(resize, resize->max_entries); if (r) return r; } r = insert_full_ablocks(resize->info, resize->size_of_block, - resize->old_nr_full_blocks, + old_nr_blocks, resize->new_nr_full_blocks, resize->max_entries, resize->value, &resize->root); From 99ba2ae4cd876bbcedb01d94c1a7952ce171418e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 21 Oct 2013 11:44:57 +0100 Subject: [PATCH 05/32] dm cache policy mq: protect residency method with existing mutex It is safe to use a mutex in mq_residency() at this point since it is only called from ioctl context. But future-proof mq_residency() by using might_sleep() to catch new contexts that cannot sleep. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 4296155090b2..db490f74c7f8 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -1001,10 +1001,14 @@ static void mq_force_mapping(struct dm_cache_policy *p, static dm_cblock_t mq_residency(struct dm_cache_policy *p) { + dm_cblock_t r; struct mq_policy *mq = to_mq_policy(p); - /* FIXME: lock mutex, not sure we can block here */ - return to_cblock(mq->nr_cblocks_allocated); + mutex_lock(&mq->lock); + r = to_cblock(mq->nr_cblocks_allocated); + mutex_unlock(&mq->lock); + + return r; } static void mq_tick(struct dm_cache_policy *p) From f8e5f01a3266e68e29024edc2bf2dbf81a864f41 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 21 Oct 2013 12:51:45 +0100 Subject: [PATCH 06/32] dm cache: io destined for the cache device can now serve as tick bios Previously only origin bios could trigger ticks, which meant if all the io was destined for the cache no ticks were generated. If no ticks are generated then multiple hits, and movements in general, are attributed to the same tick. Only a stop gap fix, we need a better solution. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 29569768ffbf..c1e92664307c 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -605,6 +605,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, dm_oblock_t oblock, dm_cblock_t cblock) { + check_if_tick_bio_needed(cache, bio); remap_to_cache(cache, bio, cblock); if (bio_data_dir(bio) == WRITE) { set_dirty(cache, oblock, cblock); From 66cb1910df17b38334153462ec8166e48058035f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 30 Oct 2013 17:11:58 +0000 Subject: [PATCH 07/32] dm cache: fix a race condition between queuing new migrations and quiescing for a shutdown The code that was trying to do this was inadequate. The postsuspend method (in ioctl context), needs to wait for the worker thread to acknowledge the request to quiesce. Otherwise the migration count may drop to zero temporarily before the worker thread realises we're quiescing. In this case the target will be taken down, but the worker thread may have issued a new migration, which will cause an oops when it completes. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.9+ --- drivers/md/dm-cache-target.c | 74 ++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index c1e92664307c..25d3253e72d0 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -148,6 +148,9 @@ struct cache { wait_queue_head_t migration_wait; atomic_t nr_migrations; + wait_queue_head_t quiescing_wait; + atomic_t quiescing_ack; + /* * cache_size entries, dirty if set */ @@ -749,8 +752,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, static void cleanup_migration(struct dm_cache_migration *mg) { - dec_nr_migrations(mg->cache); + struct cache *cache = mg->cache; free_migration(mg); + dec_nr_migrations(cache); } static void migration_failure(struct dm_cache_migration *mg) @@ -1347,24 +1351,6 @@ static void writeback_some_dirty_blocks(struct cache *cache) /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ -static void start_quiescing(struct cache *cache) -{ - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - cache->quiescing = 1; - spin_unlock_irqrestore(&cache->lock, flags); -} - -static void stop_quiescing(struct cache *cache) -{ - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - cache->quiescing = 0; - spin_unlock_irqrestore(&cache->lock, flags); -} - static bool is_quiescing(struct cache *cache) { int r; @@ -1377,6 +1363,41 @@ static bool is_quiescing(struct cache *cache) return r; } +static void ack_quiescing(struct cache *cache) +{ + if (is_quiescing(cache)) { + atomic_inc(&cache->quiescing_ack); + wake_up(&cache->quiescing_wait); + } +} + +static void wait_for_quiescing_ack(struct cache *cache) +{ + wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); +} + 
+static void start_quiescing(struct cache *cache) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + cache->quiescing = true; + spin_unlock_irqrestore(&cache->lock, flags); + + wait_for_quiescing_ack(cache); +} + +static void stop_quiescing(struct cache *cache) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + cache->quiescing = false; + spin_unlock_irqrestore(&cache->lock, flags); + + atomic_set(&cache->quiescing_ack, 0); +} + static void wait_for_migrations(struct cache *cache) { wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); @@ -1421,16 +1442,15 @@ static void do_worker(struct work_struct *ws) struct cache *cache = container_of(ws, struct cache, worker); do { - if (!is_quiescing(cache)) + if (!is_quiescing(cache)) { + writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); process_deferred_bios(cache); + } process_migrations(cache, &cache->quiesced_migrations, issue_copy); process_migrations(cache, &cache->completed_migrations, complete_migration); - writeback_some_dirty_blocks(cache); - - process_deferred_writethrough_bios(cache); - if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); @@ -1443,6 +1463,9 @@ static void do_worker(struct work_struct *ws) process_migrations(cache, &cache->need_commit_migrations, migration_success_post_commit); } + + ack_quiescing(cache); + } while (more_work(cache)); } @@ -2006,6 +2029,9 @@ static int cache_create(struct cache_args *ca, struct cache **result) atomic_set(&cache->nr_migrations, 0); init_waitqueue_head(&cache->migration_wait); + init_waitqueue_head(&cache->quiescing_wait); + atomic_set(&cache->quiescing_ack, 0); + r = -ENOMEM; cache->nr_dirty = 0; cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); From 238f8363b6661fd9dd9aae854917ab9c661f3652 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 30 Oct 2013 17:29:30 +0000 Subject: [PATCH 08/32] dm cache: improve efficiency of quiescing 
flag management Make the quiescing flag an atomic_t and stop protecting it with a spin lock. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 25d3253e72d0..50afdf72285b 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -149,6 +149,7 @@ struct cache { atomic_t nr_migrations; wait_queue_head_t quiescing_wait; + atomic_t quiescing; atomic_t quiescing_ack; /* @@ -189,7 +190,6 @@ struct cache { bool need_tick_bio:1; bool sized:1; - bool quiescing:1; bool commit_requested:1; bool loaded_mappings:1; bool loaded_discards:1; @@ -1353,14 +1353,7 @@ static void writeback_some_dirty_blocks(struct cache *cache) *--------------------------------------------------------------*/ static bool is_quiescing(struct cache *cache) { - int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - r = cache->quiescing; - spin_unlock_irqrestore(&cache->lock, flags); - - return r; + return atomic_read(&cache->quiescing); } static void ack_quiescing(struct cache *cache) @@ -1378,23 +1371,13 @@ static void wait_for_quiescing_ack(struct cache *cache) static void start_quiescing(struct cache *cache) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - cache->quiescing = true; - spin_unlock_irqrestore(&cache->lock, flags); - + atomic_inc(&cache->quiescing); wait_for_quiescing_ack(cache); } static void stop_quiescing(struct cache *cache) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - cache->quiescing = false; - spin_unlock_irqrestore(&cache->lock, flags); - + atomic_set(&cache->quiescing, 0); atomic_set(&cache->quiescing_ack, 0); } @@ -2030,6 +2013,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) init_waitqueue_head(&cache->migration_wait); init_waitqueue_head(&cache->quiescing_wait); + 
atomic_set(&cache->quiescing, 0); atomic_set(&cache->quiescing_ack, 0); r = -ENOMEM; @@ -2091,7 +2075,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->need_tick_bio = true; cache->sized = false; - cache->quiescing = false; cache->commit_requested = false; cache->loaded_mappings = false; cache->loaded_discards = false; From 3351937e4a6054a925d306e36c4cddc7723b1579 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 24 Oct 2013 14:10:28 -0400 Subject: [PATCH 09/32] dm cache policy: remove return from void policy_remove_mapping No need to return from a void function. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 0928abdc49f0..a75f7e7498eb 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -61,7 +61,7 @@ static inline int policy_writeback_work(struct dm_cache_policy *p, static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) { - return p->remove_mapping(p, oblock); + p->remove_mapping(p, oblock); } static inline void policy_force_mapping(struct dm_cache_policy *p, From 0184b44e321dda893d4d4be33499d404718c3a86 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 24 Oct 2013 14:10:28 -0400 Subject: [PATCH 10/32] dm cache policy mq: a few small fixes Rename takeout_queue to concat_queue. Fix a harmless bug in mq policy's pop() function. Currently pop() always succeeds, with upcoming changes this won't be the case. Fix typo in comment above pre_cache_to_cache prototype. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index db490f74c7f8..a9a25de5b011 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -311,7 +311,7 @@ struct mq_policy { /*----------------------------------------------------------------*/ /* Free/alloc mq cache entry structures. */ -static void takeout_queue(struct list_head *lh, struct queue *q) +static void concat_queue(struct list_head *lh, struct queue *q) { unsigned level; @@ -323,8 +323,8 @@ static void free_entries(struct mq_policy *mq) { struct entry *e, *tmp; - takeout_queue(&mq->free, &mq->pre_cache); - takeout_queue(&mq->free, &mq->cache); + concat_queue(&mq->free, &mq->pre_cache); + concat_queue(&mq->free, &mq->cache); list_for_each_entry_safe(e, tmp, &mq->free, list) kmem_cache_free(mq_entry_cache, e); @@ -531,14 +531,16 @@ static void del(struct mq_policy *mq, struct entry *e) */ static struct entry *pop(struct mq_policy *mq, struct queue *q) { - struct entry *e = container_of(queue_pop(q), struct entry, list); + struct entry *e; + struct list_head *h = queue_pop(q); - if (e) { - hash_remove(e); + if (!h) + return NULL; - if (e->in_cache) - free_cblock(mq, e->cblock); - } + e = container_of(h, struct entry, list); + hash_remove(e); + if (e->in_cache) + free_cblock(mq, e->cblock); return e; } @@ -697,7 +699,7 @@ static int cache_entry_found(struct mq_policy *mq, } /* - * Moves and entry from the pre_cache to the cache. The main work is + * Moves an entry from the pre_cache to the cache. The main work is * finding which cache block to use. 
*/ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, From dd8b0c2096e53b336324e99455efcc498599ba0f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 24 Oct 2013 14:10:28 -0400 Subject: [PATCH 11/32] dm cache metadata: return bool from __superblock_all_zeroes Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 1af7255bbffb..2262b4e57a28 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -198,7 +198,7 @@ static int superblock_lock(struct dm_cache_metadata *cmd, /*----------------------------------------------------------------*/ -static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) +static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result) { int r; unsigned i; @@ -214,10 +214,10 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) return r; data_le = dm_block_data(b); - *result = 1; + *result = true; for (i = 0; i < sb_block_size; i++) { if (data_le[i] != zero) { - *result = 0; + *result = false; break; } } @@ -411,7 +411,8 @@ bad: static int __open_or_format_metadata(struct dm_cache_metadata *cmd, bool format_device) { - int r, unformatted; + int r; + bool unformatted = false; r = __superblock_all_zeroes(cmd->bm, &unformatted); if (r) From 4cb3e1db21a94781bbf05238687c3e8a715ab2f9 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 1 Oct 2013 18:35:39 -0400 Subject: [PATCH 12/32] dm cache: return -EINVAL if the user specifies unknown cache policy Return -EINVAL when the specified cache policy is unknown rather than returning -ENOMEM. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy.c | 4 ++-- drivers/md/dm-cache-target.c | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index 21c03c570c06..d80057968407 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c @@ -119,13 +119,13 @@ struct dm_cache_policy *dm_cache_policy_create(const char *name, type = get_policy(name); if (!type) { DMWARN("unknown policy type"); - return NULL; + return ERR_PTR(-EINVAL); } p = type->create(cache_size, origin_size, cache_block_size); if (!p) { put_policy(type); - return NULL; + return ERR_PTR(-ENOMEM); } p->private = type; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 50afdf72285b..81fe85757585 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1879,14 +1879,15 @@ static int set_config_values(struct cache *cache, int argc, const char **argv) static int create_cache_policy(struct cache *cache, struct cache_args *ca, char **error) { - cache->policy = dm_cache_policy_create(ca->policy_name, - cache->cache_size, - cache->origin_sectors, - cache->sectors_per_block); - if (!cache->policy) { + struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, + cache->cache_size, + cache->origin_sectors, + cache->sectors_per_block); + if (IS_ERR(p)) { *error = "Error creating cache's policy"; - return -ENOMEM; + return PTR_ERR(p); } + cache->policy = p; return 0; } From 80f659f3f546beddc5abbec4f1c5f45d22d81348 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Mon, 14 Oct 2013 17:10:47 +0200 Subject: [PATCH 13/32] dm cache: use cell_defer() boolean argument consistently Fix a few cell_defer() calls that weren't passing a bool. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 81fe85757585..dc63eb2aa69c 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -770,13 +770,13 @@ static void migration_failure(struct dm_cache_migration *mg) DMWARN_LIMIT("demotion failed; couldn't copy block"); policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); - cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); + cell_defer(cache, mg->old_ocell, mg->promote ? false : true); if (mg->promote) - cell_defer(cache, mg->new_ocell, 1); + cell_defer(cache, mg->new_ocell, true); } else { DMWARN_LIMIT("promotion failed; couldn't copy block"); policy_remove_mapping(cache->policy, mg->new_oblock); - cell_defer(cache, mg->new_ocell, 1); + cell_defer(cache, mg->new_ocell, true); } cleanup_migration(mg); @@ -828,7 +828,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) return; } else if (mg->demote) { - cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); + cell_defer(cache, mg->old_ocell, mg->promote ? false : true); if (mg->promote) { mg->demote = false; From 2c2263c93f70c6abdce90ad96a854760532aa52f Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Mon, 14 Oct 2013 17:14:45 +0200 Subject: [PATCH 14/32] dm cache: log error message if dm_kcopyd_copy() fails A migration failure should be logged (albeit limited). 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index dc63eb2aa69c..05a10c02043f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -886,8 +886,10 @@ static void issue_copy_real(struct dm_cache_migration *mg) r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); } - if (r < 0) + if (r < 0) { + DMERR_LIMIT("issuing migration failed"); migration_failure(mg); + } } static void avoid_copy(struct dm_cache_migration *mg) From da31a0787a2ac92dd219ce0d33322160b66d6a01 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Mon, 28 Oct 2013 23:21:03 +0100 Subject: [PATCH 15/32] dm crypt: properly handle extra key string in initialization Some encryption modes use extra keys (e.g. loopAES has IV seed) which are not used in block cipher initialization but are part of key string in table constructor. This patch adds an additional field which describes the length of the extra key(s) and subtracts it before real key encryption setting. The key_size always includes the size, in bytes, of the key provided in mapping table. The key_parts describes how many parts (usually keys) are contained in the whole key buffer. And key_extra_size contains size in bytes of additional keys part (this number of bytes must be subtracted because it is processed by the IV generator). | K1 | K2 | .... | K64 | Kiv | |----------- key_size ----------------- | | |-key_extra_size-| | [64 keys] | [1 key] | => key_parts = 65 Example where key string contains main key K, whitening key Kw and IV seed Kiv: | K | Kiv | Kw | |--------------- key_size --------------| | |-----key_extra_size------| | [1 key] | [1 key] | [1 key] | => key_parts = 3 Because key_extra_size is calculated during IV mode setting, key initialization is moved after this step. 
For now, this change has no effect to supported modes (thanks to ilog2 rounding) but it is required by the following patch. Also, fix a sparse warning in crypt_iv_lmk_one(). Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0fce0bc1a957..e0c61a326550 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -171,7 +171,8 @@ struct crypt_config { unsigned long flags; unsigned int key_size; - unsigned int key_parts; + unsigned int key_parts; /* independent parts in key buffer */ + unsigned int key_extra_size; /* additional keys length */ u8 key[0]; }; @@ -530,7 +531,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, char ctx[crypto_shash_descsize(lmk->hash_tfm)]; } sdesc; struct md5_state md5state; - u32 buf[4]; + __le32 buf[4]; int i, r; sdesc.desc.tfm = lmk->hash_tfm; @@ -1274,9 +1275,12 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) static int crypt_setkey_allcpus(struct crypt_config *cc) { - unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); + unsigned subkey_size; int err = 0, i, r; + /* Ignore extra keys (which are used for IV etc) */ + subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); + for (i = 0; i < cc->tfms_count; i++) { r = crypto_ablkcipher_setkey(cc->tfms[i], cc->key + (i * subkey_size), @@ -1409,6 +1413,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, return -EINVAL; } cc->key_parts = cc->tfms_count; + cc->key_extra_size = 0; cc->cipher = kstrdup(cipher, GFP_KERNEL); if (!cc->cipher) @@ -1460,13 +1465,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, goto bad; } - /* Initialize and set key */ - ret = crypt_set_key(cc, key); - if (ret < 0) { - ti->error = "Error decoding and setting key"; - goto bad; - } - /* Initialize IV */ cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); if 
(cc->iv_size) @@ -1497,14 +1495,23 @@ static int crypt_ctr_cipher(struct dm_target *ti, * to length of provided multi-key string. * If present (version 3), last key is used as IV seed. */ - if (cc->key_size % cc->key_parts) + if (cc->key_size % cc->key_parts) { cc->key_parts++; + cc->key_extra_size = cc->key_size / cc->key_parts; + } } else { ret = -EINVAL; ti->error = "Invalid IV mode"; goto bad; } + /* Initialize and set key */ + ret = crypt_set_key(cc, key); + if (ret < 0) { + ti->error = "Error decoding and setting key"; + goto bad; + } + /* Allocate IV */ if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); From ed04d98169f1c33ebc79f510c855eed83924d97f Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Mon, 28 Oct 2013 23:21:04 +0100 Subject: [PATCH 16/32] dm crypt: add TCW IV mode for old CBC TCRYPT containers dm-crypt can already activate TCRYPT (TrueCrypt compatible) containers in LRW or XTS block encryption mode. TCRYPT containers prior to version 4.1 use CBC mode with some additional tweaks, this patch adds support for these containers. This new mode is implemented using special IV generator named TCW (TrueCrypt IV with whitening). TCW IV only supports containers that are encrypted with one cipher (Tested with AES, Twofish, Serpent, CAST5 and TripleDES). While this mode is legacy and is known to be vulnerable to some watermarking attacks (e.g. revealing of hidden disk existence) it can still be useful to activate old containers without using 3rd party software or for independent forensic analysis of such containers. (Both the userspace and kernel code is an independent implementation based on the format documentation and it completely avoids use of original source code.) The TCW IV generator uses two additional keys: Kw (whitening seed, size is always 16 bytes - TCW_WHITENING_SIZE) and Kiv (IV seed, size is always the IV size of the selected cipher). 
These keys are concatenated at the end of the main encryption key provided in mapping table. While whitening is completely independent from IV, it is implemented inside IV generator for simplification. The whitening value is always 16 bytes long and is calculated per sector from provided Kw as initial seed, xored with sector number and mixed with CRC32 algorithm. Resulting value is xored with ciphertext sector content. IV is calculated from the provided Kiv as initial IV seed and xored with sector number. Detailed calculation can be found in the TrueCrypt documentation for version < 4.1 and will also be described on dm-crypt site, see: http://code.google.com/p/cryptsetup/wiki/DMCrypt The experimental support for activation of these containers is already present in git devel branch of cryptsetup. Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-crypt.txt | 11 +- drivers/md/dm-crypt.c | 185 ++++++++++++++++++++++- 2 files changed, 192 insertions(+), 4 deletions(-) diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 2c656ae43ba7..c81839b52c4d 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -4,12 +4,15 @@ dm-crypt Device-Mapper's "crypt" target provides transparent encryption of block devices using the kernel crypto API. +For a more detailed description of supported parameters see: +http://code.google.com/p/cryptsetup/wiki/DMCrypt + Parameters: \ [<#opt_params> ] Encryption cipher and an optional IV generation mode. - (In format cipher[:keycount]-chainmode-ivopts:ivmode). + (In format cipher[:keycount]-chainmode-ivmode[:ivopts]). Examples: des aes-cbc-essiv:sha256 @@ -19,7 +22,11 @@ Parameters: \ Key used for encryption. It is encoded as a hexadecimal number. - You can only use key sizes that are valid for the selected cipher.
+ You can only use key sizes that are valid for the selected cipher + in combination with the selected iv mode. + Note that for some iv modes the key string can contain additional + keys (for example IV seed) so the key contains more parts concatenated + into a single string. Multi-key compatibility mode. You can define keys and diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e0c61a326550..50ea7ed24dce 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2,6 +2,7 @@ * Copyright (C) 2003 Christophe Saout * Copyright (C) 2004 Clemens Fruhwirth * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. + * Copyright (C) 2013 Milan Broz * * This file is released under the GPL. */ @@ -98,6 +99,13 @@ struct iv_lmk_private { u8 *seed; }; +#define TCW_WHITENING_SIZE 16 +struct iv_tcw_private { + struct crypto_shash *crc32_tfm; + u8 *iv_seed; + u8 *whitening; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. @@ -139,6 +147,7 @@ struct crypt_config { struct iv_essiv_private essiv; struct iv_benbi_private benbi; struct iv_lmk_private lmk; + struct iv_tcw_private tcw; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; @@ -231,6 +240,16 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) * version 3: the same as version 2 with additional IV seed * (it uses 65 keys, last key is used as IV seed) * + * tcw: Compatible implementation of the block chaining mode used + * by the TrueCrypt device encryption system (prior to version 4.1). + * For more info see: http://www.truecrypt.org + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from initial key and the sector number. + * In addition, whitening value is applied on every sector, whitening + * is calculated from initial key, sector number and mixed using CRC32. + * Note that this encryption scheme is vulnerable to watermarking attacks + * and should be used for old compatible containers access only. 
+ * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ @@ -609,6 +628,153 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, return r; } +static void crypt_iv_tcw_dtr(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + kzfree(tcw->iv_seed); + tcw->iv_seed = NULL; + kzfree(tcw->whitening); + tcw->whitening = NULL; + + if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) + crypto_free_shash(tcw->crc32_tfm); + tcw->crc32_tfm = NULL; +} + +static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { + ti->error = "Wrong key size for TCW"; + return -EINVAL; + } + + tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(tcw->crc32_tfm)) { + ti->error = "Error initializing CRC32 in TCW"; + return PTR_ERR(tcw->crc32_tfm); + } + + tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); + tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); + if (!tcw->iv_seed || !tcw->whitening) { + crypt_iv_tcw_dtr(cc); + ti->error = "Error allocating seed storage in TCW"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_tcw_init(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE; + + memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size); + memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size], + TCW_WHITENING_SIZE); + + return 0; +} + +static int crypt_iv_tcw_wipe(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + memset(tcw->iv_seed, 0, cc->iv_size); + memset(tcw->whitening, 0, TCW_WHITENING_SIZE); + + return 0; +} + +static int crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; 
+ u64 sector = cpu_to_le64((u64)dmreq->iv_sector); + u8 buf[TCW_WHITENING_SIZE]; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(tcw->crc32_tfm)]; + } sdesc; + int i, r; + + /* xor whitening with sector number */ + memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE); + crypto_xor(buf, (u8 *)&sector, 8); + crypto_xor(&buf[8], (u8 *)&sector, 8); + + /* calculate crc32 for every 32bit part and xor it */ + sdesc.desc.tfm = tcw->crc32_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + for (i = 0; i < 4; i++) { + r = crypto_shash_init(&sdesc.desc); + if (r) + goto out; + r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4); + if (r) + goto out; + r = crypto_shash_final(&sdesc.desc, &buf[i * 4]); + if (r) + goto out; + } + crypto_xor(&buf[0], &buf[12], 4); + crypto_xor(&buf[4], &buf[8], 4); + + /* apply whitening (8 bytes) to whole sector */ + for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) + crypto_xor(data + i * 8, buf, 8); +out: + memset(buf, 0, sizeof(buf)); + return r; +} + +static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + u64 sector = cpu_to_le64((u64)dmreq->iv_sector); + u8 *src; + int r = 0; + + /* Remove whitening from ciphertext */ + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in)); + r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src); + } + + /* Calculate IV */ + memcpy(iv, tcw->iv_seed, cc->iv_size); + crypto_xor(iv, (u8 *)&sector, 8); + if (cc->iv_size > 8) + crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8); + + return r; +} + +static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) + return 0; + + /* Apply whitening on ciphertext */ + dst = kmap_atomic(sg_page(&dmreq->sg_out)); + r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset); +
kunmap_atomic(dst); + + return r; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -644,6 +810,15 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = { .post = crypt_iv_lmk_post }; +static struct crypt_iv_operations crypt_iv_tcw_ops = { + .ctr = crypt_iv_tcw_ctr, + .dtr = crypt_iv_tcw_dtr, + .init = crypt_iv_tcw_init, + .wipe = crypt_iv_tcw_wipe, + .generator = crypt_iv_tcw_gen, + .post = crypt_iv_tcw_post +}; + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -1491,14 +1666,20 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_null_ops; else if (strcmp(ivmode, "lmk") == 0) { cc->iv_gen_ops = &crypt_iv_lmk_ops; - /* Version 2 and 3 is recognised according + /* + * Version 2 and 3 is recognised according * to length of provided multi-key string. * If present (version 3), last key is used as IV seed. + * All keys (including IV seed) are always the same size. */ if (cc->key_size % cc->key_parts) { cc->key_parts++; cc->key_extra_size = cc->key_size / cc->key_parts; } + } else if (strcmp(ivmode, "tcw") == 0) { + cc->iv_gen_ops = &crypt_iv_tcw_ops; + cc->key_parts += 2; /* IV + whitening */ + cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; } else { ret = -EINVAL; ti->error = "Invalid IV mode"; @@ -1824,7 +2005,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 12, 1}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From 7833b08e18241a1c35c09ef38be840cbf6c58acf Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Oct 2013 14:10:29 -0400 Subject: [PATCH 17/32] dm table: print error on preresume failure If preresume fails it is worth logging an error given that a device is left suspended due to the failure. 
This change was motivated by local preresume error logging that was added to the cache target ("preresume failed"). Elevating this target-agnostic context for where the target-specific error occurred relative to the DM core's callouts makes sense. Signed-off-by: Mike Snitzer Signed-off-by: Joe Thornber --- drivers/md/dm-table.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 41d907b58f7e..465f08ca62b1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1562,8 +1562,11 @@ int dm_table_resume_targets(struct dm_table *t) continue; r = ti->type->preresume(ti); - if (r) + if (r) { + DMERR("%s: %s: preresume failed, error = %d", + dm_device_name(t->md), ti->type->name, r); return r; + } } for (i = 0; i < t->num_targets; i++) { From 2c140a246dc0bc085b98eddde978060fcec1080c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 1 Nov 2013 18:27:41 -0400 Subject: [PATCH 18/32] dm: allow remove to be deferred This patch allows the removal of an open device to be deferred until it is closed. (Previously such a removal attempt would fail.) The deferred remove functionality is enabled by setting the flag DM_DEFERRED_REMOVE in the ioctl structure on DM_DEV_REMOVE or DM_REMOVE_ALL ioctl. On return from DM_DEV_REMOVE, the flag DM_DEFERRED_REMOVE indicates if the device was removed immediately or flagged to be removed on close - if the flag is clear, the device was removed. On return from DM_DEV_STATUS and other ioctls, the flag DM_DEFERRED_REMOVE is set if the device is scheduled to be removed on closure. A device that is scheduled to be deleted can be revived using the message "@cancel_deferred_remove". This message clears the DMF_DEFERRED_REMOVE flag so that the device won't be deleted on close.
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 36 ++++++++++++++++++++++----- drivers/md/dm.c | 47 ++++++++++++++++++++++++++++++++--- drivers/md/dm.h | 13 +++++++++- include/uapi/linux/dm-ioctl.h | 15 +++++++++-- 4 files changed, 99 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index afe08146f73e..51521429fb59 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -57,7 +57,7 @@ struct vers_iter { static struct list_head _name_buckets[NUM_BUCKETS]; static struct list_head _uuid_buckets[NUM_BUCKETS]; -static void dm_hash_remove_all(int keep_open_devices); +static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); /* * Guards access to both hash tables. @@ -86,7 +86,7 @@ static int dm_hash_init(void) static void dm_hash_exit(void) { - dm_hash_remove_all(0); + dm_hash_remove_all(false, false, false); } /*----------------------------------------------------------------- @@ -276,7 +276,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc) return table; } -static void dm_hash_remove_all(int keep_open_devices) +static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) { int i, dev_skipped; struct hash_cell *hc; @@ -293,7 +293,8 @@ retry: md = hc->md; dm_get(md); - if (keep_open_devices && dm_lock_for_deletion(md)) { + if (keep_open_devices && + dm_lock_for_deletion(md, mark_deferred, only_deferred)) { dm_put(md); dev_skipped++; continue; @@ -450,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, return md; } +void dm_deferred_remove(void) +{ + dm_hash_remove_all(true, false, true); +} + /*----------------------------------------------------------------- * Implementation of the ioctl commands *---------------------------------------------------------------*/ @@ -461,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t 
param_size); static int remove_all(struct dm_ioctl *param, size_t param_size) { - dm_hash_remove_all(1); + dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); param->data_size = 0; return 0; } @@ -683,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (dm_suspended_md(md)) param->flags |= DM_SUSPEND_FLAG; + if (dm_test_deferred_remove_flag(md)) + param->flags |= DM_DEFERRED_REMOVE; + param->dev = huge_encode_dev(disk_devt(disk)); /* @@ -832,8 +841,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) /* * Ensure the device is not open and nothing further can open it. */ - r = dm_lock_for_deletion(md); + r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false); if (r) { + if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) { + up_write(&_hash_lock); + dm_put(md); + return 0; + } DMDEBUG_LIMIT("unable to remove open device %s", hc->name); up_write(&_hash_lock); dm_put(md); @@ -848,6 +862,8 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) dm_table_destroy(t); } + param->flags &= ~DM_DEFERRED_REMOVE; + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) param->flags |= DM_UEVENT_GENERATED_FLAG; @@ -1469,6 +1485,14 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv, if (**argv != '@') return 2; /* no '@' prefix, deliver to target */ + if (!strcasecmp(argv[0], "@cancel_deferred_remove")) { + if (argc != 1) { + DMERR("Invalid arguments for @cancel_deferred_remove"); + return -EINVAL; + } + return dm_cancel_deferred_remove(md); + } + r = dm_stats_message(md, argc, argv, result, maxlen); if (r < 2) return r; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b3e26c7d1417..0704c523a76b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -49,6 +49,11 @@ static unsigned int _major = 0; static DEFINE_IDR(_minor_idr); static DEFINE_SPINLOCK(_minor_lock); + +static void do_deferred_remove(struct work_struct *w); + +static 
DECLARE_WORK(deferred_remove_work, do_deferred_remove); + /* * For bio-based dm. * One of these is allocated per bio. @@ -116,6 +121,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_MERGE_IS_OPTIONAL 6 +#define DMF_DEFERRED_REMOVE 7 /* * A dummy definition to make RCU happy. @@ -299,6 +305,8 @@ out_free_io_cache: static void local_exit(void) { + flush_scheduled_work(); + kmem_cache_destroy(_rq_tio_cache); kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); @@ -404,7 +412,10 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode) spin_lock(&_minor_lock); - atomic_dec(&md->open_count); + if (atomic_dec_and_test(&md->open_count) && + (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) + schedule_work(&deferred_remove_work); + dm_put(md); spin_unlock(&_minor_lock); @@ -418,14 +429,18 @@ int dm_open_count(struct mapped_device *md) /* * Guarantees nothing is using the device before it's deleted. */ -int dm_lock_for_deletion(struct mapped_device *md) +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) { int r = 0; spin_lock(&_minor_lock); - if (dm_open_count(md)) + if (dm_open_count(md)) { r = -EBUSY; + if (mark_deferred) + set_bit(DMF_DEFERRED_REMOVE, &md->flags); + } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) + r = -EEXIST; else set_bit(DMF_DELETING, &md->flags); @@ -434,6 +449,27 @@ int dm_lock_for_deletion(struct mapped_device *md) return r; } +int dm_cancel_deferred_remove(struct mapped_device *md) +{ + int r = 0; + + spin_lock(&_minor_lock); + + if (test_bit(DMF_DELETING, &md->flags)) + r = -EBUSY; + else + clear_bit(DMF_DEFERRED_REMOVE, &md->flags); + + spin_unlock(&_minor_lock); + + return r; +} + +static void do_deferred_remove(struct work_struct *w) +{ + dm_deferred_remove(); +} + sector_t dm_get_size(struct mapped_device *md) { return get_capacity(md->disk); @@ -2894,6 +2930,11 @@ int dm_suspended_md(struct mapped_device 
*md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_test_deferred_remove_flag(struct mapped_device *md) +{ + return test_bit(DMF_DEFERRED_REMOVE, &md->flags); +} + int dm_suspended(struct dm_target *ti) { return dm_suspended_md(dm_table_get_md(ti->table)); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1d1ad7b7e527..c57ba550f69e 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -128,6 +128,16 @@ int dm_deleting_md(struct mapped_device *md); */ int dm_suspended_md(struct mapped_device *md); +/* + * Test if the device is scheduled for deferred remove. + */ +int dm_test_deferred_remove_flag(struct mapped_device *md); + +/* + * Try to remove devices marked for deferred removal. + */ +void dm_deferred_remove(void); + /* * The device-mapper can be driven through one of two interfaces; * ioctl or filesystem, depending which patch you have applied. @@ -158,7 +168,8 @@ void dm_stripe_exit(void); void dm_destroy(struct mapped_device *md); void dm_destroy_immediate(struct mapped_device *md); int dm_open_count(struct mapped_device *md); -int dm_lock_for_deletion(struct mapped_device *md); +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); +int dm_cancel_deferred_remove(struct mapped_device *md); int dm_request_based(struct mapped_device *md); sector_t dm_get_size(struct mapped_device *md); struct dm_stats *dm_get_stats(struct mapped_device *md); diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index f1e12bd40b3b..c8a4302093a3 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 26 +#define DM_VERSION_MINOR 27 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2013-08-15)" +#define DM_VERSION_EXTRA "-ioctl (2013-10-30)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ 
-341,4 +341,15 @@ enum { */ #define DM_DATA_OUT_FLAG (1 << 16) /* Out */ +/* + * If set with DM_DEV_REMOVE or DM_REMOVE_ALL this indicates that if + * the device cannot be removed immediately because it is still in use + * it should instead be scheduled for removal when it gets closed. + * + * On return from DM_DEV_REMOVE, DM_DEV_STATUS or other ioctls, this + * flag indicates that the device is scheduled to be removed when it + * gets closed. + */ +#define DM_DEFERRED_REMOVE (1 << 17) /* In/Out */ + #endif /* _LINUX_DM_IOCTL_H */ From 5442851edb28d7b8dfd98023b2a10c65b93f07a1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 8 Nov 2013 10:47:12 -0500 Subject: [PATCH 19/32] dm: fix Kconfig menu indentation The option DM_LOG_USERSPACE is sub-option of DM_MIRROR, so place it right after DM_MIRROR. Doing so fixes various other Device mapper targets/features to be properly nested under "Device mapper support". Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 30b426ed744b..f2ccbc3b9fe4 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -297,6 +297,17 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_LOG_USERSPACE + tristate "Mirror userspace logging" + depends on DM_MIRROR && NET + select CONNECTOR + ---help--- + The userspace logging module provides a mechanism for + relaying the dm-dirty-log API to userspace. Log designs + which are more suited to userspace implementation (e.g. + shared storage logs) or experimental logs can be implemented + by leveraging this framework. + config DM_RAID tristate "RAID 1/4/5/6/10 target" depends on BLK_DEV_DM @@ -323,17 +334,6 @@ config DM_RAID RAID-5, RAID-6 distributes the syndromes across the drives in one of the available parity distribution methods. 
-config DM_LOG_USERSPACE - tristate "Mirror userspace logging" - depends on DM_MIRROR && NET - select CONNECTOR - ---help--- - The userspace logging module provides a mechanism for - relaying the dm-dirty-log API to userspace. Log designs - which are more suited to userspace implementation (e.g. - shared storage logs) or experimental logs can be implemented - by leveraging this framework. - config DM_ZERO tristate "Zero target" depends on BLK_DEV_DM From 41d35d25e9d4e1b5e944b70f1bec272bcff5f489 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 4 Nov 2013 19:42:38 -0500 Subject: [PATCH 20/32] MAINTAINERS: add reference to device-mapper's linux-dm.git tree Signed-off-by: Mike Snitzer --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8a0cbf3cf2c8..d6bc019b796b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2637,6 +2637,7 @@ M: dm-devel@redhat.com L: dm-devel@redhat.com W: http://sources.redhat.com/dm Q: http://patchwork.kernel.org/project/dm-devel/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git T: quilt http://people.redhat.com/agk/patches/linux/editing/ S: Maintained F: Documentation/device-mapper/ From 40c57f475f416ab241124e1e6a593b2f982eaae3 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 9 Aug 2013 14:19:32 +0100 Subject: [PATCH 21/32] dm space map disk: optimise sm_disk_dec_block Don't waste time spotting blocks that have been allocated and then freed in the same transaction. The extra lookup is expensive, and I don't think it really gives us much. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-space-map-disk.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index e735a6d5a793..cfbf9617e465 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) { - int r; - uint32_t old_count; enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_dec(&smd->ll, b, &ev); - if (!r && (ev == SM_FREE)) { - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - } - - return r; + return sm_ll_dec(&smd->ll, b, &ev); } static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) From ffcbcb6720ab6a4bb6e0a51b3711e8c60872d281 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Mon, 14 Oct 2013 17:24:43 +0200 Subject: [PATCH 22/32] dm cache: optimize commit_if_needed Check commit_requested flag _before_ calling dm_cache_changed_this_transaction() superfluously. Also, be sure to set last_commit_jiffies _after_ dm_cache_commit() completes. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 05a10c02043f..4b564069e08f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1234,15 +1234,17 @@ static int need_commit_due_to_time(struct cache *cache) static int commit_if_needed(struct cache *cache) { - if (dm_cache_changed_this_transaction(cache->cmd) && - (cache->commit_requested || need_commit_due_to_time(cache))) { + int r = 0; + + if ((cache->commit_requested || need_commit_due_to_time(cache)) && + dm_cache_changed_this_transaction(cache->cmd)) { atomic_inc(&cache->stats.commit_count); - cache->last_commit_jiffies = jiffies; cache->commit_requested = false; - return dm_cache_commit(cache->cmd, false); + r = dm_cache_commit(cache->cmd, false); + cache->last_commit_jiffies = jiffies; } - return 0; + return r; } static void process_deferred_bios(struct cache *cache) From 01911c19bea63b1a958b9d9024504c2e9079f155 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 24 Oct 2013 14:10:28 -0400 Subject: [PATCH 23/32] dm cache policy mq: implement writeback_work() and mq_{set,clear}_dirty() There are now two multiqueues for in cache blocks. A clean one and a dirty one. writeback_work comes from the dirty one. Demotions come from the clean one. There are two benefits: - Performance improvement, since demoting a clean block is a noop. - The cache cleans itself when io load is light. 
Signed-off-by: Joe Thornber Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- .../device-mapper/cache-policies.txt | 6 +- drivers/md/dm-cache-policy-mq.c | 147 +++++++++++++++--- 2 files changed, 132 insertions(+), 21 deletions(-) diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt index d7c440b444cc..df52a849957f 100644 --- a/Documentation/device-mapper/cache-policies.txt +++ b/Documentation/device-mapper/cache-policies.txt @@ -30,8 +30,10 @@ multiqueue This policy is the default. -The multiqueue policy has two sets of 16 queues: one set for entries -waiting for the cache and another one for those in the cache. +The multiqueue policy has three sets of 16 queues: one set for entries +waiting for the cache and another two for those in the cache (a set for +clean entries and a set for dirty entries). + Cache entries in the queues are aged based on logical time. Entry into the cache is based on variable thresholds and queue selection is based on hit count on entry. The policy aims to take different cache miss diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index a9a25de5b011..6710e038c730 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -224,6 +224,7 @@ struct entry { * FIXME: pack these better */ bool in_cache:1; + bool dirty:1; unsigned hit_count; unsigned generation; unsigned tick; @@ -238,13 +239,15 @@ struct mq_policy { struct io_tracker tracker; /* - * We maintain two queues of entries. The cache proper contains - * the currently active mappings. Whereas the pre_cache tracks - * blocks that are being hit frequently and potential candidates - * for promotion to the cache. + * We maintain three queues of entries. The cache proper, + * consisting of a clean and dirty queue, contains the currently + * active mappings. 
Whereas the pre_cache tracks blocks that + * are being hit frequently and potential candidates for promotion + * to the cache. */ struct queue pre_cache; - struct queue cache; + struct queue cache_clean; + struct queue cache_dirty; /* * Keeps track of time, incremented by the core. We use this to @@ -324,7 +327,8 @@ static void free_entries(struct mq_policy *mq) struct entry *e, *tmp; concat_queue(&mq->free, &mq->pre_cache); - concat_queue(&mq->free, &mq->cache); + concat_queue(&mq->free, &mq->cache_clean); + concat_queue(&mq->free, &mq->cache_dirty); list_for_each_entry_safe(e, tmp, &mq->free, list) kmem_cache_free(mq_entry_cache, e); @@ -508,7 +512,8 @@ static void push(struct mq_policy *mq, struct entry *e) if (e->in_cache) { alloc_cblock(mq, e->cblock); - queue_push(&mq->cache, queue_level(e), &e->list); + queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, + queue_level(e), &e->list); } else queue_push(&mq->pre_cache, queue_level(e), &e->list); } @@ -558,7 +563,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e) * of the entries. * * At the moment the threshold is taken by averaging the hit counts of some - * of the entries in the cache (the first 20 entries of the first level). + * of the entries in the cache (the first 20 entries across all levels in + * ascending order, giving preference to the clean entries at each level). * * We can be much cleverer than this though. For example, each promotion * could bump up the threshold helping to prevent churn. 
Much more to do @@ -580,7 +586,16 @@ static void check_generation(struct mq_policy *mq) mq->generation++; for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { - head = mq->cache.qs + level; + head = mq->cache_clean.qs + level; + list_for_each_entry(e, head, list) { + nr++; + total += e->hit_count; + + if (++count >= MAX_TO_AVERAGE) + break; + } + + head = mq->cache_dirty.qs + level; list_for_each_entry(e, head, list) { nr++; total += e->hit_count; @@ -633,19 +648,28 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) * - set the hit count to a hard coded value other than 1, eg, is it better * if it goes in at level 2? */ -static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) +static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t *cblock) { - dm_cblock_t result; - struct entry *demoted = pop(mq, &mq->cache); + struct entry *demoted = pop(mq, &mq->cache_clean); - BUG_ON(!demoted); - result = demoted->cblock; + if (!demoted) + /* + * We could get a block from mq->cache_dirty, but that + * would add extra latency to the triggering bio as it + * waits for the writeback. Better to not promote this + * time and hope there's a clean block next time this block + * is hit. 
+ */ + return -ENOSPC; + + *cblock = demoted->cblock; *oblock = demoted->oblock; demoted->in_cache = false; + demoted->dirty = false; demoted->hit_count = 1; push(mq, demoted); - return result; + return 0; } /* @@ -705,11 +729,16 @@ static int cache_entry_found(struct mq_policy *mq, static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, struct policy_result *result) { + int r; dm_cblock_t cblock; if (find_free_cblock(mq, &cblock) == -ENOSPC) { result->op = POLICY_REPLACE; - cblock = demote_cblock(mq, &result->old_oblock); + r = demote_cblock(mq, &result->old_oblock, &cblock); + if (r) { + result->op = POLICY_MISS; + return 0; + } } else result->op = POLICY_NEW; @@ -717,6 +746,7 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, del(mq, e); e->in_cache = true; + e->dirty = false; push(mq, e); return 0; @@ -760,6 +790,7 @@ static void insert_in_pre_cache(struct mq_policy *mq, } e->in_cache = false; + e->dirty = false; e->oblock = oblock; e->hit_count = 1; e->generation = mq->generation; @@ -787,6 +818,7 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, e->oblock = oblock; e->cblock = cblock; e->in_cache = true; + e->dirty = false; e->hit_count = 1; e->generation = mq->generation; push(mq, e); @@ -917,6 +949,40 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t return r; } +/* + * FIXME: __mq_set_clear_dirty can block due to mutex. + * Ideally a policy should not block in functions called + * from the map() function. Explore using RCU. 
+ */ +static void __mq_set_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock, bool set) +{ + struct mq_policy *mq = to_mq_policy(p); + struct entry *e; + + mutex_lock(&mq->lock); + e = hash_lookup(mq, oblock); + if (!e) + DMWARN("__mq_set_clear_dirty called for a block that isn't in the cache"); + else { + BUG_ON(!e->in_cache); + + del(mq, e); + e->dirty = set; + push(mq, e); + } + mutex_unlock(&mq->lock); +} + +static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + __mq_set_clear_dirty(p, oblock, true); +} + +static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + __mq_set_clear_dirty(p, oblock, false); +} + static int mq_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, uint32_t hint, bool hint_valid) @@ -931,6 +997,7 @@ static int mq_load_mapping(struct dm_cache_policy *p, e->cblock = cblock; e->oblock = oblock; e->in_cache = true; + e->dirty = false; /* this gets corrected in a minute */ e->hit_count = hint_valid ? 
hint : 1; e->generation = mq->generation; push(mq, e); @@ -949,7 +1016,14 @@ static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, mutex_lock(&mq->lock); for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each_entry(e, &mq->cache.qs[level], list) { + list_for_each_entry(e, &mq->cache_clean.qs[level], list) { + r = fn(context, e->cblock, e->oblock, e->hit_count); + if (r) + goto out; + } + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each_entry(e, &mq->cache_dirty.qs[level], list) { r = fn(context, e->cblock, e->oblock, e->hit_count); if (r) goto out; @@ -974,11 +1048,41 @@ static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) del(mq, e); e->in_cache = false; + e->dirty = false; push(mq, e); mutex_unlock(&mq->lock); } +static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + struct entry *e = pop(mq, &mq->cache_dirty); + + if (!e) + return -ENODATA; + + *oblock = e->oblock; + *cblock = e->cblock; + e->dirty = false; + push(mq, e); + + return 0; +} + +static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = __mq_writeback_work(mq, oblock, cblock); + mutex_unlock(&mq->lock); + + return r; +} + static void force_mapping(struct mq_policy *mq, dm_oblock_t current_oblock, dm_oblock_t new_oblock) { @@ -988,6 +1092,7 @@ static void force_mapping(struct mq_policy *mq, del(mq, e); e->oblock = new_oblock; + e->dirty = true; push(mq, e); } @@ -1063,10 +1168,12 @@ static void init_policy_functions(struct mq_policy *mq) mq->policy.destroy = mq_destroy; mq->policy.map = mq_map; mq->policy.lookup = mq_lookup; + mq->policy.set_dirty = mq_set_dirty; + mq->policy.clear_dirty = mq_clear_dirty; mq->policy.load_mapping = mq_load_mapping; mq->policy.walk_mappings = mq_walk_mappings; mq->policy.remove_mapping = mq_remove_mapping; - 
mq->policy.writeback_work = NULL; + mq->policy.writeback_work = mq_writeback_work; mq->policy.force_mapping = mq_force_mapping; mq->policy.residency = mq_residency; mq->policy.tick = mq_tick; @@ -1099,7 +1206,9 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->find_free_last_word = 0; queue_init(&mq->pre_cache); - queue_init(&mq->cache); + queue_init(&mq->cache_clean); + queue_init(&mq->cache_dirty); + mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); mq->nr_entries = 2 * from_cblock(cache_size); From c86c30706caa02ffe303e6b87d53ef6a077d4cca Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 24 Oct 2013 14:10:28 -0400 Subject: [PATCH 24/32] dm cache: be much more aggressive about promoting writes to discarded blocks Previously these promotions only got priority if there were unused cache blocks. Now we give them priority if there are any clean blocks in the cache. The fio_soak_test in the device-mapper-test-suite now gives uniform performance across subvolumes (~16 seconds). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 84 ++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 6710e038c730..444f0bf10b21 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -150,6 +150,21 @@ static void queue_init(struct queue *q) INIT_LIST_HEAD(q->qs + i); } +/* + * Checks to see if the queue is empty. + * FIXME: reduce cpu usage. + */ +static bool queue_empty(struct queue *q) +{ + unsigned i; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) + if (!list_empty(q->qs + i)) + return false; + + return true; +} + /* * Insert an entry to the back of the given level. 
*/ @@ -442,6 +457,11 @@ static bool any_free_cblocks(struct mq_policy *mq) return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); } +static bool any_clean_cblocks(struct mq_policy *mq) +{ + return !queue_empty(&mq->cache_clean); +} + /* * Fills result out with a cache block that isn't in use, or return * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is @@ -688,17 +708,18 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t static unsigned adjusted_promote_threshold(struct mq_policy *mq, bool discarded_oblock, int data_dir) { - if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE) + if (data_dir == READ) + return mq->promote_threshold + READ_PROMOTE_THRESHOLD; + + if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { /* * We don't need to do any copying at all, so give this a - * very low threshold. In practice this only triggers - * during initial population after a format. + * very low threshold. */ return DISCARDED_PROMOTE_THRESHOLD; + } - return data_dir == READ ? 
- (mq->promote_threshold + READ_PROMOTE_THRESHOLD) : - (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD); + return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; } static bool should_promote(struct mq_policy *mq, struct entry *e, @@ -772,6 +793,17 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, return r; } +static void insert_entry_in_pre_cache(struct mq_policy *mq, + struct entry *e, dm_oblock_t oblock) +{ + e->in_cache = false; + e->dirty = false; + e->oblock = oblock; + e->hit_count = 1; + e->generation = mq->generation; + push(mq, e); +} + static void insert_in_pre_cache(struct mq_policy *mq, dm_oblock_t oblock) { @@ -789,30 +821,41 @@ static void insert_in_pre_cache(struct mq_policy *mq, return; } - e->in_cache = false; - e->dirty = false; - e->oblock = oblock; - e->hit_count = 1; - e->generation = mq->generation; - push(mq, e); + insert_entry_in_pre_cache(mq, e, oblock); } static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, struct policy_result *result) { + int r; struct entry *e; dm_cblock_t cblock; if (find_free_cblock(mq, &cblock) == -ENOSPC) { - result->op = POLICY_MISS; - insert_in_pre_cache(mq, oblock); - return; - } + r = demote_cblock(mq, &result->old_oblock, &cblock); + if (unlikely(r)) { + result->op = POLICY_MISS; + insert_in_pre_cache(mq, oblock); + return; + } - e = alloc_entry(mq); - if (unlikely(!e)) { - result->op = POLICY_MISS; - return; + /* + * This will always succeed, since we've just demoted. 
+ */ + e = pop(mq, &mq->pre_cache); + result->op = POLICY_REPLACE; + + } else { + e = alloc_entry(mq); + if (unlikely(!e)) + e = pop(mq, &mq->pre_cache); + + if (unlikely(!e)) { + result->op = POLICY_MISS; + return; + } + + result->op = POLICY_NEW; } e->oblock = oblock; @@ -823,7 +866,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, e->generation = mq->generation; push(mq, e); - result->op = POLICY_NEW; result->cblock = e->cblock; } From c9d28d5d09a0fd5f02f1321c8e18ff7d9f92270b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 31 Oct 2013 13:55:48 -0400 Subject: [PATCH 25/32] dm cache: promotion optimisation for writes If a write block triggers promotion and covers a whole block we can avoid a copy. Introduce dm_{hook,unhook}_bio to simplify saving and restoring bio fields (bi_private is now used by overwrite). Switch writethrough support over to using these helpers too. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 93 +++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 4b564069e08f..655994fdf308 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -61,6 +61,34 @@ static void free_bitset(unsigned long *bits) /*----------------------------------------------------------------*/ +/* + * There are a couple of places where we let a bio run, but want to do some + * work before calling its endio function. We do this by temporarily + * changing the endio fn. 
+ */ +struct dm_hook_info { + bio_end_io_t *bi_end_io; + void *bi_private; +}; + +static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, + bio_end_io_t *bi_end_io, void *bi_private) +{ + h->bi_end_io = bio->bi_end_io; + h->bi_private = bio->bi_private; + + bio->bi_end_io = bi_end_io; + bio->bi_private = bi_private; +} + +static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) +{ + bio->bi_end_io = h->bi_end_io; + bio->bi_private = h->bi_private; +} + +/*----------------------------------------------------------------*/ + #define PRISON_CELLS 1024 #define MIGRATION_POOL_SIZE 128 #define COMMIT_PERIOD HZ @@ -214,7 +242,7 @@ struct per_bio_data { */ struct cache *cache; dm_cblock_t cblock; - bio_end_io_t *saved_bi_end_io; + struct dm_hook_info hook_info; struct dm_bio_details bio_details; }; @@ -231,6 +259,7 @@ struct dm_cache_migration { bool writeback:1; bool demote:1; bool promote:1; + bool requeue_holder:1; struct dm_bio_prison_cell *old_ocell; struct dm_bio_prison_cell *new_ocell; @@ -666,7 +695,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) static void writethrough_endio(struct bio *bio, int err) { struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); - bio->bi_end_io = pb->saved_bi_end_io; + + dm_unhook_bio(&pb->hook_info, bio); if (err) { bio_endio(bio, err); @@ -697,9 +727,8 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, pb->cache = cache; pb->cblock = cblock; - pb->saved_bi_end_io = bio->bi_end_io; + dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); dm_bio_record(&pb->bio_details, bio); - bio->bi_end_io = writethrough_endio; remap_to_origin_clear_discard(pb->cache, bio, oblock); } @@ -841,7 +870,12 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) cleanup_migration(mg); } else { - cell_defer(cache, mg->new_ocell, true); + if (mg->requeue_holder) + cell_defer(cache, mg->new_ocell, true); + else { + 
bio_endio(mg->new_ocell->holder, 0); + cell_defer(cache, mg->new_ocell, false); + } clear_dirty(cache, mg->new_oblock, mg->cblock); cleanup_migration(mg); } @@ -892,6 +926,42 @@ static void issue_copy_real(struct dm_cache_migration *mg) } } +static void overwrite_endio(struct bio *bio, int err) +{ + struct dm_cache_migration *mg = bio->bi_private; + struct cache *cache = mg->cache; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + unsigned long flags; + + if (err) + mg->err = true; + + spin_lock_irqsave(&cache->lock, flags); + list_add_tail(&mg->list, &cache->completed_migrations); + dm_unhook_bio(&pb->hook_info, bio); + mg->requeue_holder = false; + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) +{ + size_t pb_data_size = get_per_bio_data_size(mg->cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); + remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); + generic_make_request(bio); +} + +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); +} + static void avoid_copy(struct dm_cache_migration *mg) { atomic_inc(&mg->cache->stats.copies_avoided); @@ -906,9 +976,17 @@ static void issue_copy(struct dm_cache_migration *mg) if (mg->writeback || mg->demote) avoid = !is_dirty(cache, mg->cblock) || is_discarded_oblock(cache, mg->old_oblock); - else + else { + struct bio *bio = mg->new_ocell->holder; + avoid = is_discarded_oblock(cache, mg->new_oblock); + if (!avoid && bio_writes_complete_block(cache, bio)) { + issue_overwrite(mg, bio); + return; + } + } + avoid ? 
avoid_copy(mg) : issue_copy_real(mg); } @@ -998,6 +1076,7 @@ static void promote(struct cache *cache, struct prealloc *structs, mg->writeback = false; mg->demote = false; mg->promote = true; + mg->requeue_holder = true; mg->cache = cache; mg->new_oblock = oblock; mg->cblock = cblock; @@ -1019,6 +1098,7 @@ static void writeback(struct cache *cache, struct prealloc *structs, mg->writeback = true; mg->demote = false; mg->promote = false; + mg->requeue_holder = true; mg->cache = cache; mg->old_oblock = oblock; mg->cblock = cblock; @@ -1042,6 +1122,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, mg->writeback = false; mg->demote = true; mg->promote = true; + mg->requeue_holder = true; mg->cache = cache; mg->old_oblock = old_oblock; mg->new_oblock = new_oblock; From f494a9c6b1b6dd9a9f21bbb75d9210d478eeb498 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 31 Oct 2013 13:55:49 -0400 Subject: [PATCH 26/32] dm cache: cache shrinking support Allow a cache to shrink if the blocks being removed from the cache are not dirty. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 66 ++++++++++++++++++++++++++++++++++ drivers/md/dm-cache-target.c | 63 +++++++++++++++++++++++++++----- 2 files changed, 120 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 2262b4e57a28..062b83ed3e84 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -667,19 +667,85 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd) kfree(cmd); } +/* + * Checks that the given cache block is either unmapped or clean. 
+ */ +static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, + bool *result) +{ + int r; + __le64 value; + dm_oblock_t ob; + unsigned flags; + + r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); + if (r) { + DMERR("block_unmapped_or_clean failed"); + return r; + } + + unpack_value(value, &ob, &flags); + *result = !((flags & M_VALID) && (flags & M_DIRTY)); + + return 0; +} + +static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + int r; + *result = true; + + while (begin != end) { + r = block_unmapped_or_clean(cmd, begin, result); + if (r) + return r; + + if (!*result) { + DMERR("cache block %llu is dirty", + (unsigned long long) from_cblock(begin)); + return 0; + } + + begin = to_cblock(from_cblock(begin) + 1); + } + + return 0; +} + int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) { int r; + bool clean; __le64 null_mapping = pack_value(0, 0); down_write(&cmd->root_lock); __dm_bless_for_disk(&null_mapping); + + if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) { + r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean); + if (r) { + __dm_unbless_for_disk(&null_mapping); + goto out; + } + + if (!clean) { + DMERR("unable to shrink cache due to dirty blocks"); + r = -EINVAL; + __dm_unbless_for_disk(&null_mapping); + goto out; + } + } + r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), &null_mapping, &cmd->root); if (!r) cmd->cache_blocks = new_cache_size; cmd->changed = true; + +out: up_write(&cmd->root_lock); return r; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 655994fdf308..183dfc9db297 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2502,26 +2502,71 @@ static int load_discard(void *context, sector_t discard_block_size, return 0; } +static dm_cblock_t 
get_cache_dev_size(struct cache *cache) +{ + sector_t size = get_dev_size(cache->cache_dev); + (void) sector_div(size, cache->sectors_per_block); + return to_cblock(size); +} + +static bool can_resize(struct cache *cache, dm_cblock_t new_size) +{ + if (from_cblock(new_size) > from_cblock(cache->cache_size)) + return true; + + /* + * We can't drop a dirty block when shrinking the cache. + */ + while (from_cblock(new_size) < from_cblock(cache->cache_size)) { + new_size = to_cblock(from_cblock(new_size) + 1); + if (is_dirty(cache, new_size)) { + DMERR("unable to shrink cache; cache block %llu is dirty", + (unsigned long long) from_cblock(new_size)); + return false; + } + } + + return true; +} + +static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) +{ + int r; + + r = dm_cache_resize(cache->cmd, cache->cache_size); + if (r) { + DMERR("could not resize cache metadata"); + return r; + } + + cache->cache_size = new_size; + + return 0; +} + static int cache_preresume(struct dm_target *ti) { int r = 0; struct cache *cache = ti->private; - sector_t actual_cache_size = get_dev_size(cache->cache_dev); - (void) sector_div(actual_cache_size, cache->sectors_per_block); + dm_cblock_t csize = get_cache_dev_size(cache); /* * Check to see if the cache has resized. 
*/ - if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { - cache->cache_size = to_cblock(actual_cache_size); - - r = dm_cache_resize(cache->cmd, cache->cache_size); - if (r) { - DMERR("could not resize cache metadata"); + if (!cache->sized) { + r = resize_cache_dev(cache, csize); + if (r) return r; - } cache->sized = true; + + } else if (csize != cache->cache_size) { + if (!can_resize(cache, csize)) + return -EINVAL; + + r = resize_cache_dev(cache, csize); + if (r) + return r; } if (!cache->loaded_mappings) { From 2ee57d587357f0d752af6c2e3e46434a74b1bee3 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 24 Oct 2013 14:10:29 -0400 Subject: [PATCH 27/32] dm cache: add passthrough mode "Passthrough" is a dm-cache operating mode (like writethrough or writeback) which is intended to be used when the cache contents are not known to be coherent with the origin device. It behaves as follows: * All reads are served from the origin device (all reads miss the cache) * All writes are forwarded to the origin device; additionally, write hits cause cache block invalidates This mode decouples cache coherency checks from cache device creation, largely to avoid having to perform coherency checks while booting. Boot scripts can create cache devices in passthrough mode and put them into service (mount cached filesystems, for example) without having to worry about coherency. Coherency that exists is maintained, although the cache will gradually cool as writes take place. Later, applications can perform coherency checks, the nature of which will depend on the type of the underlying storage. If coherency can be verified, the cache device can be transitioned to writethrough or writeback mode while still warm; otherwise, the cache contents can be discarded prior to transitioning to the desired operating mode. 
Signed-off-by: Joe Thornber Signed-off-by: Heinz Mauelshagen Signed-off-by: Morgan Mears Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache.txt | 19 ++- drivers/md/dm-cache-metadata.c | 5 + drivers/md/dm-cache-metadata.h | 5 + drivers/md/dm-cache-target.c | 209 +++++++++++++++++++++----- 4 files changed, 200 insertions(+), 38 deletions(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index 33d45ee0b737..ff6639f72536 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -68,10 +68,11 @@ So large block sizes are bad because they waste cache space. And small block sizes are bad because they increase the amount of metadata (both in core and on disk). -Writeback/writethrough ----------------------- +Cache operating modes +--------------------- -The cache has two modes, writeback and writethrough. +The cache has three operating modes: writeback, writethrough and +passthrough. If writeback, the default, is selected then a write to a block that is cached will go only to the cache and the block will be marked dirty in @@ -81,6 +82,18 @@ If writethrough is selected then a write to a cached block will not complete until it has hit both the origin and cache devices. Clean blocks should remain clean. +If passthrough is selected, useful when the cache contents are not known +to be coherent with the origin device, then all reads are served from +the origin device (all reads miss the cache) and all writes are +forwarded to the origin device; additionally, write hits cause cache +block invalidates. Passthrough mode allows a cache device to be +activated without having to worry about coherency. Coherency that +exists is maintained, although the cache will gradually cool as writes +take place. If the coherency of the cache can later be verified, or +established, the cache device can be transitioned to writethrough or +writeback mode while still warm.
Otherwise, the cache contents can be +discarded prior to transitioning to the desired operating mode. + A simple cleaner policy is provided, which will clean (write back) all dirty blocks in a cache. Useful for decommissioning a cache. diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 062b83ed3e84..8601425436cd 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1249,3 +1249,8 @@ int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, return r; } + +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) +{ + return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); +} diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index f45cef21f3d0..cd906f14f98d 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -137,6 +137,11 @@ int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, uint32_t hint); +/* + * Query method. Are all the blocks in the cache clean? + */ +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); + /*----------------------------------------------------------------*/ #endif /* DM_CACHE_METADATA_H */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 183dfc9db297..8c0217753cc5 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -104,14 +104,37 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) /* * FIXME: the cache is read/write for the time being. */ -enum cache_mode { +enum cache_metadata_mode { CM_WRITE, /* metadata may be changed */ CM_READ_ONLY, /* metadata may not be changed */ }; +enum cache_io_mode { + /* + * Data is written to cached blocks only. These blocks are marked + * dirty. If you lose the cache device you will lose data. 
+ * Potential performance increase for both reads and writes. + */ + CM_IO_WRITEBACK, + + /* + * Data is written to both cache and origin. Blocks are never + * dirty. Potential performance benefit for reads only. + */ + CM_IO_WRITETHROUGH, + + /* + * A degraded mode useful for various cache coherency situations + * (eg, rolling back snapshots). Reads and writes always go to the + * origin. If a write goes to a cached oblock, then the cache + * block is invalidated. + */ + CM_IO_PASSTHROUGH +}; + struct cache_features { - enum cache_mode mode; - bool write_through:1; + enum cache_metadata_mode mode; + enum cache_io_mode io_mode; }; struct cache_stats { @@ -565,9 +588,24 @@ static void save_stats(struct cache *cache) #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) +static bool writethrough_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITETHROUGH; +} + +static bool writeback_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITEBACK; +} + +static bool passthrough_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_PASSTHROUGH; +} + static size_t get_per_bio_data_size(struct cache *cache) { - return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; + return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; } static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) @@ -1135,6 +1173,32 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, quiesce_migration(mg); } +/* + * Invalidate a cache entry. No writeback occurs; any changes in the cache + * block are thrown away.
+ */ +static void invalidate(struct cache *cache, struct prealloc *structs, + dm_oblock_t oblock, dm_cblock_t cblock, + struct dm_bio_prison_cell *cell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->writeback = false; + mg->demote = true; + mg->promote = false; + mg->requeue_holder = true; + mg->cache = cache; + mg->old_oblock = oblock; + mg->cblock = cblock; + mg->old_ocell = cell; + mg->new_ocell = NULL; + mg->start_jiffies = jiffies; + + inc_nr_migrations(cache); + quiesce_migration(mg); +} + /*---------------------------------------------------------------- * bio processing *--------------------------------------------------------------*/ @@ -1197,13 +1261,6 @@ static bool spare_migration_bandwidth(struct cache *cache) return current_volume < cache->migration_threshold; } -static bool is_writethrough_io(struct cache *cache, struct bio *bio, - dm_cblock_t cblock) -{ - return bio_data_dir(bio) == WRITE && - cache->features.write_through && !is_dirty(cache, cblock); -} - static void inc_hit_counter(struct cache *cache, struct bio *bio) { atomic_inc(bio_data_dir(bio) == READ ? 
@@ -1216,6 +1273,15 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) &cache->stats.read_miss : &cache->stats.write_miss); } +static void issue_cache_bio(struct cache *cache, struct bio *bio, + struct per_bio_data *pb, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_cache_dirty(cache, bio, oblock, cblock); + issue(cache, bio); +} + static void process_bio(struct cache *cache, struct prealloc *structs, struct bio *bio) { @@ -1227,7 +1293,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); bool discarded_block = is_discarded_oblock(cache, block); - bool can_migrate = discarded_block || spare_migration_bandwidth(cache); + bool passthrough = passthrough_mode(&cache->features); + bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); /* * Check to see if that block is currently migrating. @@ -1248,15 +1315,39 @@ static void process_bio(struct cache *cache, struct prealloc *structs, switch (lookup_result.op) { case POLICY_HIT: - inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + if (passthrough) { + inc_miss_counter(cache, bio); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - else - remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + /* + * Passthrough always maps to the origin, + * invalidating any cache blocks that are written + * to. 
+ */ + + if (bio_data_dir(bio) == WRITE) { + atomic_inc(&cache->stats.demotion); + invalidate(cache, structs, block, lookup_result.cblock, new_ocell); + release_cell = false; + + } else { + /* FIXME: factor out issue_origin() */ + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); + } + } else { + inc_hit_counter(cache, bio); + + if (bio_data_dir(bio) == WRITE && + writethrough_mode(&cache->features) && + !is_dirty(cache, lookup_result.cblock)) { + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + issue(cache, bio); + } else + issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); + } - issue(cache, bio); break; case POLICY_MISS: @@ -1807,7 +1898,7 @@ static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, static void init_features(struct cache_features *cf) { cf->mode = CM_WRITE; - cf->write_through = false; + cf->io_mode = CM_IO_WRITEBACK; } static int parse_features(struct cache_args *ca, struct dm_arg_set *as, @@ -1832,10 +1923,13 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, arg = dm_shift_arg(as); if (!strcasecmp(arg, "writeback")) - cf->write_through = false; + cf->io_mode = CM_IO_WRITEBACK; else if (!strcasecmp(arg, "writethrough")) - cf->write_through = true; + cf->io_mode = CM_IO_WRITETHROUGH; + + else if (!strcasecmp(arg, "passthrough")) + cf->io_mode = CM_IO_PASSTHROUGH; else { *error = "Unrecognised cache feature requested"; @@ -2088,6 +2182,22 @@ static int cache_create(struct cache_args *ca, struct cache **result) } cache->cmd = cmd; + if (passthrough_mode(&cache->features)) { + bool all_clean; + + r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); + if (r) { + *error = "dm_cache_metadata_all_clean() failed"; + goto bad; + } + + if (!all_clean) { + *error = "Cannot enter passthrough mode unless all blocks are clean"; + r = 
-EINVAL; + goto bad; + } + } + spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); bio_list_init(&cache->deferred_flush_bios); @@ -2303,17 +2413,37 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } + r = DM_MAPIO_REMAPPED; switch (lookup_result.op) { case POLICY_HIT: - inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + if (passthrough_mode(&cache->features)) { + if (bio_data_dir(bio) == WRITE) { + /* + * We need to invalidate this block, so + * defer for the worker thread. + */ + cell_defer(cache, cell, true); + r = DM_MAPIO_SUBMITTED; - if (is_writethrough_io(cache, bio, lookup_result.cblock)) - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - else - remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + } else { + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + inc_miss_counter(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); + cell_defer(cache, cell, false); + } + + } else { + inc_hit_counter(cache, bio); + + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + !is_dirty(cache, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else + remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + + cell_defer(cache, cell, false); + } break; case POLICY_MISS: @@ -2338,10 +2468,10 @@ static int cache_map(struct dm_target *ti, struct bio *bio) DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, (unsigned) lookup_result.op); bio_io_error(bio); - return DM_MAPIO_SUBMITTED; + r = DM_MAPIO_SUBMITTED; } - return DM_MAPIO_REMAPPED; + return r; } static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) @@ -2659,10 +2789,19 @@ static void cache_status(struct dm_target *ti, status_type_t type, (unsigned long long) from_cblock(residency), cache->nr_dirty); - if 
(cache->features.write_through) + if (writethrough_mode(&cache->features)) DMEMIT("1 writethrough "); - else - DMEMIT("0 "); + + else if (passthrough_mode(&cache->features)) + DMEMIT("1 passthrough "); + + else if (writeback_mode(&cache->features)) + DMEMIT("1 writeback "); + + else { + DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode); + goto err; + } DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); if (sz < maxlen) { @@ -2771,7 +2910,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, From 53d498198d3e8bce4287112beafc30befcba98cc Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 16 Oct 2013 10:59:26 +0100 Subject: [PATCH 28/32] dm cache metadata: check the metadata version when reading the superblock Need to check the version to verify on-disk metadata is supported. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 8601425436cd..9ef0752e8a08 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -20,7 +20,13 @@ #define CACHE_SUPERBLOCK_MAGIC 06142003 #define CACHE_SUPERBLOCK_LOCATION 0 -#define CACHE_VERSION 1 + +/* + * defines a range of metadata versions that this module can handle. 
+ */ +#define MIN_CACHE_VERSION 1 +#define MAX_CACHE_VERSION 1 + #define CACHE_METADATA_CACHE_SIZE 64 /* @@ -134,6 +140,18 @@ static void sb_prepare_for_write(struct dm_block_validator *v, SUPERBLOCK_CSUM_XOR)); } +static int check_metadata_version(struct cache_disk_superblock *disk_super) +{ + uint32_t metadata_version = le32_to_cpu(disk_super->version); + if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { + DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", + metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); + return -EINVAL; + } + + return 0; +} + static int sb_check(struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) @@ -164,7 +182,7 @@ static int sb_check(struct dm_block_validator *v, return -EILSEQ; } - return 0; + return check_metadata_version(disk_super); } static struct dm_block_validator sb_validator = { @@ -270,7 +288,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->flags = 0; memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); - disk_super->version = cpu_to_le32(CACHE_VERSION); + disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; From 633618e3353f8953e43d989d08302f5dcd51d8be Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Sat, 9 Nov 2013 11:12:51 +0000 Subject: [PATCH 29/32] dm cache policy mq: reduce memory requirements Rather than storing the cblock in each cache entry, we allocate all entries in an array and infer the cblock from the entry position. Saves 4 bytes of memory per cache block. In addition, this gives us an easy way of looking up cache entries by cblock. We no longer need to keep an explicit bitset to track which cblocks have been allocated. 
And no searching is needed to find free cblocks. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-mq.c | 547 ++++++++++++++------------------ 1 file changed, 233 insertions(+), 314 deletions(-) diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 444f0bf10b21..782bf854666a 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -26,19 +26,6 @@ static unsigned next_power(unsigned n, unsigned min) /*----------------------------------------------------------------*/ -static unsigned long *alloc_bitset(unsigned nr_entries) -{ - size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); - return vzalloc(s); -} - -static void free_bitset(unsigned long *bits) -{ - vfree(bits); -} - -/*----------------------------------------------------------------*/ - /* * Large, sequential ios are probably better left on the origin device since * spindles tend to have good bandwidth. @@ -233,18 +220,107 @@ struct entry { struct hlist_node hlist; struct list_head list; dm_oblock_t oblock; - dm_cblock_t cblock; /* valid iff in_cache */ /* * FIXME: pack these better */ - bool in_cache:1; bool dirty:1; unsigned hit_count; unsigned generation; unsigned tick; }; +/* + * Rather than storing the cblock in an entry, we allocate all entries in + * an array, and infer the cblock from the entry position. + * + * Free entries are linked together into a list. 
+ */ +struct entry_pool { + struct entry *entries, *entries_end; + struct list_head free; + unsigned nr_allocated; +}; + +static int epool_init(struct entry_pool *ep, unsigned nr_entries) +{ + unsigned i; + + ep->entries = vzalloc(sizeof(struct entry) * nr_entries); + if (!ep->entries) + return -ENOMEM; + + ep->entries_end = ep->entries + nr_entries; + + INIT_LIST_HEAD(&ep->free); + for (i = 0; i < nr_entries; i++) + list_add(&ep->entries[i].list, &ep->free); + + ep->nr_allocated = 0; + + return 0; +} + +static void epool_exit(struct entry_pool *ep) +{ + vfree(ep->entries); +} + +static struct entry *alloc_entry(struct entry_pool *ep) +{ + struct entry *e; + + if (list_empty(&ep->free)) + return NULL; + + e = list_entry(list_pop(&ep->free), struct entry, list); + INIT_LIST_HEAD(&e->list); + INIT_HLIST_NODE(&e->hlist); + ep->nr_allocated++; + + return e; +} + +/* + * This assumes the cblock hasn't already been allocated. + */ +static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) +{ + struct entry *e = ep->entries + from_cblock(cblock); + list_del(&e->list); + + INIT_LIST_HEAD(&e->list); + INIT_HLIST_NODE(&e->hlist); + ep->nr_allocated++; + + return e; +} + +static void free_entry(struct entry_pool *ep, struct entry *e) +{ + BUG_ON(!ep->nr_allocated); + ep->nr_allocated--; + INIT_HLIST_NODE(&e->hlist); + list_add(&e->list, &ep->free); +} + +static bool epool_empty(struct entry_pool *ep) +{ + return list_empty(&ep->free); +} + +static bool in_pool(struct entry_pool *ep, struct entry *e) +{ + return e >= ep->entries && e < ep->entries_end; +} + +static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) +{ + return to_cblock(e - ep->entries); +} + +/*----------------------------------------------------------------*/ + struct mq_policy { struct dm_cache_policy policy; @@ -253,6 +329,13 @@ struct mq_policy { dm_cblock_t cache_size; struct io_tracker tracker; + /* + * Entries come from two pools, one of pre-cache entries, and 
one + * for the cache proper. + */ + struct entry_pool pre_cache_pool; + struct entry_pool cache_pool; + /* * We maintain three queues of entries. The cache proper, * consisting of a clean and dirty queue, contains the currently @@ -299,25 +382,6 @@ struct mq_policy { */ unsigned promote_threshold; - /* - * We need cache_size entries for the cache, and choose to have - * cache_size entries for the pre_cache too. One motivation for - * using the same size is to make the hit counts directly - * comparable between pre_cache and cache. - */ - unsigned nr_entries; - unsigned nr_entries_allocated; - struct list_head free; - - /* - * Cache blocks may be unallocated. We store this info in a - * bitset. - */ - unsigned long *allocation_bitset; - unsigned nr_cblocks_allocated; - unsigned find_free_nr_words; - unsigned find_free_last_word; - /* * The hash table allows us to quickly find an entry by origin * block. Both pre_cache and cache entries are in here. @@ -327,50 +391,6 @@ struct mq_policy { struct hlist_head *table; }; -/*----------------------------------------------------------------*/ -/* Free/alloc mq cache entry structures. 
*/ -static void concat_queue(struct list_head *lh, struct queue *q) -{ - unsigned level; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_splice(q->qs + level, lh); -} - -static void free_entries(struct mq_policy *mq) -{ - struct entry *e, *tmp; - - concat_queue(&mq->free, &mq->pre_cache); - concat_queue(&mq->free, &mq->cache_clean); - concat_queue(&mq->free, &mq->cache_dirty); - - list_for_each_entry_safe(e, tmp, &mq->free, list) - kmem_cache_free(mq_entry_cache, e); -} - -static int alloc_entries(struct mq_policy *mq, unsigned elts) -{ - unsigned u = mq->nr_entries; - - INIT_LIST_HEAD(&mq->free); - mq->nr_entries_allocated = 0; - - while (u--) { - struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL); - - if (!e) { - free_entries(mq); - return -ENOMEM; - } - - - list_add(&e->list, &mq->free); - } - - return 0; -} - /*----------------------------------------------------------------*/ /* @@ -407,54 +427,9 @@ static void hash_remove(struct entry *e) /*----------------------------------------------------------------*/ -/* - * Allocates a new entry structure. The memory is allocated in one lump, - * so we just handing it out here. Returns NULL if all entries have - * already been allocated. Cannot fail otherwise. - */ -static struct entry *alloc_entry(struct mq_policy *mq) -{ - struct entry *e; - - if (mq->nr_entries_allocated >= mq->nr_entries) { - BUG_ON(!list_empty(&mq->free)); - return NULL; - } - - e = list_entry(list_pop(&mq->free), struct entry, list); - INIT_LIST_HEAD(&e->list); - INIT_HLIST_NODE(&e->hlist); - - mq->nr_entries_allocated++; - return e; -} - -/*----------------------------------------------------------------*/ - -/* - * Mark cache blocks allocated or not in the bitset. 
- */ -static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); - BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset)); - - set_bit(from_cblock(cblock), mq->allocation_bitset); - mq->nr_cblocks_allocated++; -} - -static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); - BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset)); - - clear_bit(from_cblock(cblock), mq->allocation_bitset); - mq->nr_cblocks_allocated--; -} - static bool any_free_cblocks(struct mq_policy *mq) { - return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); + return !epool_empty(&mq->cache_pool); } static bool any_clean_cblocks(struct mq_policy *mq) @@ -462,48 +437,6 @@ static bool any_clean_cblocks(struct mq_policy *mq) return !queue_empty(&mq->cache_clean); } -/* - * Fills result out with a cache block that isn't in use, or return - * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is - * reponsible for that. 
- */ -static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end, - dm_cblock_t *result, unsigned *last_word) -{ - int r = -ENOSPC; - unsigned w; - - for (w = begin; w < end; w++) { - /* - * ffz is undefined if no zero exists - */ - if (mq->allocation_bitset[w] != ~0UL) { - *last_word = w; - *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w])); - if (from_cblock(*result) < from_cblock(mq->cache_size)) - r = 0; - - break; - } - } - - return r; -} - -static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result) -{ - int r; - - if (!any_free_cblocks(mq)) - return -ENOSPC; - - r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word); - if (r == -ENOSPC && mq->find_free_last_word) - r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word); - - return r; -} - /*----------------------------------------------------------------*/ /* @@ -520,34 +453,35 @@ static unsigned queue_level(struct entry *e) return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); } +static bool in_cache(struct mq_policy *mq, struct entry *e) +{ + return in_pool(&mq->cache_pool, e); +} + /* * Inserts the entry into the pre_cache or the cache. Ensures the cache - * block is marked as allocated if necc. Inserts into the hash table. Sets the - * tick which records when the entry was last moved about. + * block is marked as allocated if necc. Inserts into the hash table. + * Sets the tick which records when the entry was last moved about. */ static void push(struct mq_policy *mq, struct entry *e) { e->tick = mq->tick; hash_insert(mq, e); - if (e->in_cache) { - alloc_cblock(mq, e->cblock); + if (in_cache(mq, e)) queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, queue_level(e), &e->list); - } else + else queue_push(&mq->pre_cache, queue_level(e), &e->list); } /* * Removes an entry from pre_cache or cache. Removes from the hash table. 
- Frees off the cache block if necc. */ static void del(struct mq_policy *mq, struct entry *e) { queue_remove(&e->list); hash_remove(e); - if (e->in_cache) - free_cblock(mq, e->cblock); } /* @@ -564,8 +498,6 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q) e = container_of(h, struct entry, list); hash_remove(e); - if (e->in_cache) - free_cblock(mq, e->cblock); return e; } @@ -599,9 +531,7 @@ static void check_generation(struct mq_policy *mq) struct list_head *head; struct entry *e; - if ((mq->hit_count >= mq->generation_period) && - (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) { - + if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { mq->hit_count = 0; mq->generation++; @@ -668,7 +598,7 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) * - set the hit count to a hard coded value other than 1, eg, is it better * if it goes in at level 2? */ -static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t *cblock) +static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) { struct entry *demoted = pop(mq, &mq->cache_clean); @@ -682,12 +612,14 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t */ return -ENOSPC; - *cblock = demoted->cblock; *oblock = demoted->oblock; - demoted->in_cache = false; - demoted->dirty = false; - demoted->hit_count = 1; - push(mq, demoted); + free_entry(&mq->cache_pool, demoted); + + /* + * We used to put the demoted block into the pre-cache, but I think + * it's simpler to just let it work its way up from zero again. + * Stops blocks flickering in and out of the cache. 
+ */ return 0; } @@ -735,9 +667,9 @@ static int cache_entry_found(struct mq_policy *mq, { requeue_and_update_tick(mq, e); - if (e->in_cache) { + if (in_cache(mq, e)) { result->op = POLICY_HIT; - result->cblock = e->cblock; + result->cblock = infer_cblock(&mq->cache_pool, e); } return 0; @@ -751,11 +683,12 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, struct policy_result *result) { int r; - dm_cblock_t cblock; + struct entry *new_e; - if (find_free_cblock(mq, &cblock) == -ENOSPC) { + /* Ensure there's a free cblock in the cache */ + if (epool_empty(&mq->cache_pool)) { result->op = POLICY_REPLACE; - r = demote_cblock(mq, &result->old_oblock, &cblock); + r = demote_cblock(mq, &result->old_oblock); if (r) { result->op = POLICY_MISS; return 0; @@ -763,12 +696,20 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, } else result->op = POLICY_NEW; - result->cblock = e->cblock = cblock; + new_e = alloc_entry(&mq->cache_pool); + BUG_ON(!new_e); + + new_e->oblock = e->oblock; + new_e->dirty = false; + new_e->hit_count = e->hit_count; + new_e->generation = e->generation; + new_e->tick = e->tick; del(mq, e); - e->in_cache = true; - e->dirty = false; - push(mq, e); + free_entry(&mq->pre_cache_pool, e); + push(mq, new_e); + + result->cblock = infer_cblock(&mq->cache_pool, new_e); return 0; } @@ -793,21 +734,10 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, return r; } -static void insert_entry_in_pre_cache(struct mq_policy *mq, - struct entry *e, dm_oblock_t oblock) -{ - e->in_cache = false; - e->dirty = false; - e->oblock = oblock; - e->hit_count = 1; - e->generation = mq->generation; - push(mq, e); -} - static void insert_in_pre_cache(struct mq_policy *mq, dm_oblock_t oblock) { - struct entry *e = alloc_entry(mq); + struct entry *e = alloc_entry(&mq->pre_cache_pool); if (!e) /* @@ -821,7 +751,11 @@ static void insert_in_pre_cache(struct mq_policy *mq, return; } - insert_entry_in_pre_cache(mq, e, oblock); 
+ e->dirty = false; + e->oblock = oblock; + e->hit_count = 1; + e->generation = mq->generation; + push(mq, e); } static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, @@ -829,10 +763,10 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, { int r; struct entry *e; - dm_cblock_t cblock; - if (find_free_cblock(mq, &cblock) == -ENOSPC) { - r = demote_cblock(mq, &result->old_oblock, &cblock); + if (epool_empty(&mq->cache_pool)) { + result->op = POLICY_REPLACE; + r = demote_cblock(mq, &result->old_oblock); if (unlikely(r)) { result->op = POLICY_MISS; insert_in_pre_cache(mq, oblock); @@ -842,31 +776,21 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, /* * This will always succeed, since we've just demoted. */ - e = pop(mq, &mq->pre_cache); - result->op = POLICY_REPLACE; + e = alloc_entry(&mq->cache_pool); + BUG_ON(!e); } else { - e = alloc_entry(mq); - if (unlikely(!e)) - e = pop(mq, &mq->pre_cache); - - if (unlikely(!e)) { - result->op = POLICY_MISS; - return; - } - + e = alloc_entry(&mq->cache_pool); result->op = POLICY_NEW; } e->oblock = oblock; - e->cblock = cblock; - e->in_cache = true; e->dirty = false; e->hit_count = 1; e->generation = mq->generation; push(mq, e); - result->cblock = e->cblock; + result->cblock = infer_cblock(&mq->cache_pool, e); } static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, @@ -897,13 +821,16 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock, int r = 0; struct entry *e = hash_lookup(mq, oblock); - if (e && e->in_cache) + if (e && in_cache(mq, e)) r = cache_entry_found(mq, e, result); + else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) result->op = POLICY_MISS; + else if (e) r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, data_dir, result); + else r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, data_dir, result); @@ -930,9 +857,9 @@ static void mq_destroy(struct dm_cache_policy *p) { struct mq_policy *mq = 
to_mq_policy(p); - free_bitset(mq->allocation_bitset); kfree(mq->table); - free_entries(mq); + epool_exit(&mq->cache_pool); + epool_exit(&mq->pre_cache_pool); kfree(mq); } @@ -980,8 +907,8 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t return -EWOULDBLOCK; e = hash_lookup(mq, oblock); - if (e && e->in_cache) { - *cblock = e->cblock; + if (e && in_cache(mq, e)) { + *cblock = infer_cblock(&mq->cache_pool, e); r = 0; } else r = -ENOENT; @@ -991,38 +918,34 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t return r; } -/* - * FIXME: __mq_set_clear_dirty can block due to mutex. - * Ideally a policy should not block in functions called - * from the map() function. Explore using RCU. - */ -static void __mq_set_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock, bool set) +static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set) { - struct mq_policy *mq = to_mq_policy(p); struct entry *e; - mutex_lock(&mq->lock); e = hash_lookup(mq, oblock); - if (!e) - DMWARN("__mq_set_clear_dirty called for a block that isn't in the cache"); - else { - BUG_ON(!e->in_cache); + BUG_ON(!e || !in_cache(mq, e)); - del(mq, e); - e->dirty = set; - push(mq, e); - } - mutex_unlock(&mq->lock); + del(mq, e); + e->dirty = set; + push(mq, e); } static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) { - __mq_set_clear_dirty(p, oblock, true); + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __mq_set_clear_dirty(mq, oblock, true); + mutex_unlock(&mq->lock); } static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) { - __mq_set_clear_dirty(p, oblock, false); + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __mq_set_clear_dirty(mq, oblock, false); + mutex_unlock(&mq->lock); } static int mq_load_mapping(struct dm_cache_policy *p, @@ -1032,13 +955,8 @@ static int mq_load_mapping(struct dm_cache_policy *p, struct mq_policy *mq = 
to_mq_policy(p); struct entry *e; - e = alloc_entry(mq); - if (!e) - return -ENOMEM; - - e->cblock = cblock; + e = alloc_particular_entry(&mq->cache_pool, cblock); e->oblock = oblock; - e->in_cache = true; e->dirty = false; /* this gets corrected in a minute */ e->hit_count = hint_valid ? hint : 1; e->generation = mq->generation; @@ -1047,52 +965,58 @@ static int mq_load_mapping(struct dm_cache_policy *p, return 0; } +static int mq_save_hints(struct mq_policy *mq, struct queue *q, + policy_walk_fn fn, void *context) +{ + int r; + unsigned level; + struct entry *e; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each_entry(e, q->qs + level, list) { + r = fn(context, infer_cblock(&mq->cache_pool, e), + e->oblock, e->hit_count); + if (r) + return r; + } + + return 0; +} + static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, void *context) { struct mq_policy *mq = to_mq_policy(p); int r = 0; - struct entry *e; - unsigned level; mutex_lock(&mq->lock); - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each_entry(e, &mq->cache_clean.qs[level], list) { - r = fn(context, e->cblock, e->oblock, e->hit_count); - if (r) - goto out; - } + r = mq_save_hints(mq, &mq->cache_clean, fn, context); + if (!r) + r = mq_save_hints(mq, &mq->cache_dirty, fn, context); - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each_entry(e, &mq->cache_dirty.qs[level], list) { - r = fn(context, e->cblock, e->oblock, e->hit_count); - if (r) - goto out; - } - -out: mutex_unlock(&mq->lock); return r; } +static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) +{ + struct entry *e; + + e = hash_lookup(mq, oblock); + BUG_ON(!e || !in_cache(mq, e)); + + del(mq, e); + free_entry(&mq->cache_pool, e); +} + static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) { struct mq_policy *mq = to_mq_policy(p); - struct entry *e; mutex_lock(&mq->lock); - - e = hash_lookup(mq, oblock); - - BUG_ON(!e || !e->in_cache); - - 
del(mq, e); - e->in_cache = false; - e->dirty = false; - push(mq, e); - + __remove_mapping(mq, oblock); mutex_unlock(&mq->lock); } @@ -1105,7 +1029,7 @@ static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, return -ENODATA; *oblock = e->oblock; - *cblock = e->cblock; + *cblock = infer_cblock(&mq->cache_pool, e); e->dirty = false; push(mq, e); @@ -1125,17 +1049,17 @@ static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, return r; } -static void force_mapping(struct mq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) +static void __force_mapping(struct mq_policy *mq, + dm_oblock_t current_oblock, dm_oblock_t new_oblock) { struct entry *e = hash_lookup(mq, current_oblock); - BUG_ON(!e || !e->in_cache); - - del(mq, e); - e->oblock = new_oblock; - e->dirty = true; - push(mq, e); + if (e && in_cache(mq, e)) { + del(mq, e); + e->oblock = new_oblock; + e->dirty = true; + push(mq, e); + } } static void mq_force_mapping(struct dm_cache_policy *p, @@ -1144,7 +1068,7 @@ static void mq_force_mapping(struct dm_cache_policy *p, struct mq_policy *mq = to_mq_policy(p); mutex_lock(&mq->lock); - force_mapping(mq, current_oblock, new_oblock); + __force_mapping(mq, current_oblock, new_oblock); mutex_unlock(&mq->lock); } @@ -1154,7 +1078,7 @@ static dm_cblock_t mq_residency(struct dm_cache_policy *p) struct mq_policy *mq = to_mq_policy(p); mutex_lock(&mq->lock); - r = to_cblock(mq->nr_cblocks_allocated); + r = to_cblock(mq->cache_pool.nr_allocated); mutex_unlock(&mq->lock); return r; @@ -1227,7 +1151,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - int r; struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); if (!mq) @@ -1235,8 +1158,18 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, init_policy_functions(mq); iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); - mq->cache_size = cache_size; + + if 
(epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) { + DMERR("couldn't initialize pool of pre-cache entries"); + goto bad_pre_cache_init; + } + + if (epool_init(&mq->cache_pool, from_cblock(cache_size))) { + DMERR("couldn't initialize pool of cache entries"); + goto bad_cache_init; + } + mq->tick_protected = 0; mq->tick = 0; mq->hit_count = 0; @@ -1244,8 +1177,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->promote_threshold = 0; mutex_init(&mq->lock); spin_lock_init(&mq->tick_lock); - mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG); - mq->find_free_last_word = 0; queue_init(&mq->pre_cache); queue_init(&mq->cache_clean); @@ -1253,31 +1184,19 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); - mq->nr_entries = 2 * from_cblock(cache_size); - r = alloc_entries(mq, mq->nr_entries); - if (r) - goto bad_cache_alloc; - - mq->nr_entries_allocated = 0; - mq->nr_cblocks_allocated = 0; - mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); mq->hash_bits = ffs(mq->nr_buckets) - 1; mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); if (!mq->table) goto bad_alloc_table; - mq->allocation_bitset = alloc_bitset(from_cblock(cache_size)); - if (!mq->allocation_bitset) - goto bad_alloc_bitset; - return &mq->policy; -bad_alloc_bitset: - kfree(mq->table); bad_alloc_table: - free_entries(mq); -bad_cache_alloc: + epool_exit(&mq->cache_pool); +bad_cache_init: + epool_exit(&mq->pre_cache_pool); +bad_pre_cache_init: kfree(mq); return NULL; @@ -1287,7 +1206,7 @@ bad_cache_alloc: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1295,7 +1214,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = 
{1, 0, 0}, + .version = {1, 1, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create From 532906aa7f9656209f30f08dfadd328fc1bc6912 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 8 Nov 2013 16:36:17 +0000 Subject: [PATCH 30/32] dm cache: add remove_cblock method to policy interface Implement policy_remove_cblock() and add remove_cblock method to the mq policy. These methods will be used by the following cache block invalidation patch which adds the 'invalidate_cblocks' message to the cache core. Also, update some comments in dm-cache-policy.h Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-internal.h | 5 ++++ drivers/md/dm-cache-policy-mq.c | 35 +++++++++++++++++++++++++++ drivers/md/dm-cache-policy.h | 21 +++++++++++++--- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index a75f7e7498eb..2256a1f24f73 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -64,6 +64,11 @@ static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t p->remove_mapping(p, oblock); } +static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + return p->remove_cblock(p, cblock); +} + static inline void policy_force_mapping(struct dm_cache_policy *p, dm_oblock_t current_oblock, dm_oblock_t new_oblock) { diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 782bf854666a..7209fab8b8ed 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -304,6 +304,15 @@ static void free_entry(struct entry_pool *ep, struct entry *e) list_add(&e->list, &ep->free); } +/* + * Returns NULL if the entry is free. + */ +static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) +{ + struct entry *e = ep->entries + from_cblock(cblock); + return e->hlist.pprev ? 
e : NULL; +} + static bool epool_empty(struct entry_pool *ep) { return list_empty(&ep->free); @@ -1020,6 +1029,31 @@ static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) mutex_unlock(&mq->lock); } +static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock) +{ + struct entry *e = epool_find(&mq->cache_pool, cblock); + + if (!e) + return -ENODATA; + + del(mq, e); + free_entry(&mq->cache_pool, e); + + return 0; +} + +static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = __remove_cblock(mq, cblock); + mutex_unlock(&mq->lock); + + return r; +} + static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t *cblock) { @@ -1139,6 +1173,7 @@ static void init_policy_functions(struct mq_policy *mq) mq->policy.load_mapping = mq_load_mapping; mq->policy.walk_mappings = mq_walk_mappings; mq->policy.remove_mapping = mq_remove_mapping; + mq->policy.remove_cblock = mq_remove_cblock; mq->policy.writeback_work = mq_writeback_work; mq->policy.force_mapping = mq_force_mapping; mq->policy.residency = mq_residency; diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index 33369ca9614f..052c00a84a5c 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -135,9 +135,6 @@ struct dm_cache_policy { */ int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); - /* - * oblock must be a mapped block. Must not block. - */ void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); @@ -159,8 +156,24 @@ struct dm_cache_policy { void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, dm_oblock_t new_oblock); - int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); + /* + * This is called via the invalidate_cblocks message. 
It is + * possible the particular cblock has already been removed due to a + * write io in passthrough mode, in which case this should return + * -ENODATA. + */ + int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); + /* + * Provide a dirty block to be written back by the core target. + * + * Returns: + * + * 0 and @cblock,@oblock: block to write back provided + * + * -ENODATA: no dirty blocks available + */ + int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); /* * How full is the cache? From 65790ff919e2e07ccb4457415c11075b245d643b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 8 Nov 2013 16:39:50 +0000 Subject: [PATCH 31/32] dm cache: add cache block invalidation support Cache block invalidation is removing an entry from the cache without writing it back. Cache blocks can be invalidated via the 'invalidate_cblocks' message, which takes an arbitrary number of cblock ranges: invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]* E.g. dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789 Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache.txt | 12 +- drivers/md/dm-cache-target.c | 225 +++++++++++++++++++++++++- 2 files changed, 233 insertions(+), 4 deletions(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index ff6639f72536..fc9d2dfb9415 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -244,12 +244,22 @@ The message format is: E.g. dmsetup message my_cache 0 sequential_threshold 1024 + +Invalidation is removing an entry from the cache without writing it +back. Cache blocks can be invalidated via the invalidate_cblocks +message, which takes an arbitrary number of cblock ranges. + + invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]* + +E.g. 
+ dmsetup message my_cache 0 invalidate_cblocks 2345 3456-4567 5678-6789 + Examples ======== The test suite can be found here: -https://github.com/jthornber/thinp-test-suite +https://github.com/jthornber/device-mapper-test-suite dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 8c0217753cc5..41e664b474f1 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -150,6 +150,25 @@ struct cache_stats { atomic_t discard_count; }; +/* + * Defines a range of cblocks, begin to (end - 1) are in the range. end is + * the one-past-the-end value. + */ +struct cblock_range { + dm_cblock_t begin; + dm_cblock_t end; +}; + +struct invalidation_request { + struct list_head list; + struct cblock_range *cblocks; + + atomic_t complete; + int err; + + wait_queue_head_t result_wait; +}; + struct cache { struct dm_target *ti; struct dm_target_callbacks callbacks; @@ -241,6 +260,7 @@ struct cache { bool need_tick_bio:1; bool sized:1; + bool invalidate:1; bool commit_requested:1; bool loaded_mappings:1; bool loaded_discards:1; @@ -251,6 +271,12 @@ struct cache { struct cache_features features; struct cache_stats stats; + + /* + * Invalidation fields. 
+ */ + spinlock_t invalidation_lock; + struct list_head invalidation_requests; }; struct per_bio_data { @@ -283,6 +309,7 @@ struct dm_cache_migration { bool demote:1; bool promote:1; bool requeue_holder:1; + bool invalidate:1; struct dm_bio_prison_cell *old_ocell; struct dm_bio_prison_cell *new_ocell; @@ -904,8 +931,11 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) list_add_tail(&mg->list, &cache->quiesced_migrations); spin_unlock_irqrestore(&cache->lock, flags); - } else + } else { + if (mg->invalidate) + policy_remove_mapping(cache->policy, mg->old_oblock); cleanup_migration(mg); + } } else { if (mg->requeue_holder) @@ -1115,6 +1145,7 @@ static void promote(struct cache *cache, struct prealloc *structs, mg->demote = false; mg->promote = true; mg->requeue_holder = true; + mg->invalidate = false; mg->cache = cache; mg->new_oblock = oblock; mg->cblock = cblock; @@ -1137,6 +1168,7 @@ static void writeback(struct cache *cache, struct prealloc *structs, mg->demote = false; mg->promote = false; mg->requeue_holder = true; + mg->invalidate = false; mg->cache = cache; mg->old_oblock = oblock; mg->cblock = cblock; @@ -1161,6 +1193,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, mg->demote = true; mg->promote = true; mg->requeue_holder = true; + mg->invalidate = false; mg->cache = cache; mg->old_oblock = old_oblock; mg->new_oblock = new_oblock; @@ -1188,6 +1221,7 @@ static void invalidate(struct cache *cache, struct prealloc *structs, mg->demote = true; mg->promote = false; mg->requeue_holder = true; + mg->invalidate = true; mg->cache = cache; mg->old_oblock = oblock; mg->cblock = cblock; @@ -1524,6 +1558,58 @@ static void writeback_some_dirty_blocks(struct cache *cache) prealloc_free_structs(cache, &structs); } +/*---------------------------------------------------------------- + * Invalidations. + * Dropping something from the cache *without* writing back. 
+ *--------------------------------------------------------------*/ + +static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) +{ + int r = 0; + uint64_t begin = from_cblock(req->cblocks->begin); + uint64_t end = from_cblock(req->cblocks->end); + + while (begin != end) { + r = policy_remove_cblock(cache->policy, to_cblock(begin)); + if (!r) { + r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); + if (r) + break; + + } else if (r == -ENODATA) { + /* harmless, already unmapped */ + r = 0; + + } else { + DMERR("policy_remove_cblock failed"); + break; + } + + begin++; + } + + cache->commit_requested = true; + + req->err = r; + atomic_set(&req->complete, 1); + + wake_up(&req->result_wait); +} + +static void process_invalidation_requests(struct cache *cache) +{ + struct list_head list; + struct invalidation_request *req, *tmp; + + INIT_LIST_HEAD(&list); + spin_lock(&cache->invalidation_lock); + list_splice_init(&cache->invalidation_requests, &list); + spin_unlock(&cache->invalidation_lock); + + list_for_each_entry_safe (req, tmp, &list, list) + process_invalidation_request(cache, req); +} + /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ @@ -1593,7 +1679,8 @@ static int more_work(struct cache *cache) !bio_list_empty(&cache->deferred_writethrough_bios) || !list_empty(&cache->quiesced_migrations) || !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations); + !list_empty(&cache->need_commit_migrations) || + cache->invalidate; } static void do_worker(struct work_struct *ws) @@ -1605,6 +1692,7 @@ static void do_worker(struct work_struct *ws) writeback_some_dirty_blocks(cache); process_deferred_writethrough_bios(cache); process_deferred_bios(cache); + process_invalidation_requests(cache); } process_migrations(cache, &cache->quiesced_migrations, issue_copy); @@ -2271,6 +2359,7 @@ static int 
cache_create(struct cache_args *ca, struct cache **result) cache->need_tick_bio = true; cache->sized = false; + cache->invalidate = false; cache->commit_requested = false; cache->loaded_mappings = false; cache->loaded_discards = false; @@ -2284,6 +2373,9 @@ static int cache_create(struct cache_args *ca, struct cache **result) atomic_set(&cache->stats.commit_count, 0); atomic_set(&cache->stats.discard_count, 0); + spin_lock_init(&cache->invalidation_lock); + INIT_LIST_HEAD(&cache->invalidation_requests); + *result = cache; return 0; @@ -2833,7 +2925,128 @@ err: } /* - * Supports <key> <value>. + * A cache block range can take two forms: + * + * i) A single cblock, eg. '3456' + * ii) A begin and end cblock with a dash between, eg. '123-234' + */ +static int parse_cblock_range(struct cache *cache, const char *str, + struct cblock_range *result) +{ + char dummy; + uint64_t b, e; + int r; + + /* + * Try and parse form (ii) first. + */ + r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); + if (r < 0) + return r; + + if (r == 2) { + result->begin = to_cblock(b); + result->end = to_cblock(e); + return 0; + } + + /* + * That didn't work, try form (i). 
+ */ + r = sscanf(str, "%llu%c", &b, &dummy); + if (r < 0) + return r; + + if (r == 1) { + result->begin = to_cblock(b); + result->end = to_cblock(from_cblock(result->begin) + 1u); + return 0; + } + + DMERR("invalid cblock range '%s'", str); + return -EINVAL; +} + +static int validate_cblock_range(struct cache *cache, struct cblock_range *range) +{ + uint64_t b = from_cblock(range->begin); + uint64_t e = from_cblock(range->end); + uint64_t n = from_cblock(cache->cache_size); + + if (b >= n) { + DMERR("begin cblock out of range: %llu >= %llu", b, n); + return -EINVAL; + } + + if (e > n) { + DMERR("end cblock out of range: %llu > %llu", e, n); + return -EINVAL; + } + + if (b >= e) { + DMERR("invalid cblock range: %llu >= %llu", b, e); + return -EINVAL; + } + + return 0; +} + +static int request_invalidation(struct cache *cache, struct cblock_range *range) +{ + struct invalidation_request req; + + INIT_LIST_HEAD(&req.list); + req.cblocks = range; + atomic_set(&req.complete, 0); + req.err = 0; + init_waitqueue_head(&req.result_wait); + + spin_lock(&cache->invalidation_lock); + list_add(&req.list, &cache->invalidation_requests); + spin_unlock(&cache->invalidation_lock); + wake_worker(cache); + + wait_event(req.result_wait, atomic_read(&req.complete)); + return req.err; +} + +static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, + const char **cblock_ranges) +{ + int r = 0; + unsigned i; + struct cblock_range range; + + if (!passthrough_mode(&cache->features)) { + DMERR("cache has to be in passthrough mode for invalidation"); + return -EPERM; + } + + for (i = 0; i < count; i++) { + r = parse_cblock_range(cache, cblock_ranges[i], &range); + if (r) + break; + + r = validate_cblock_range(cache, &range); + if (r) + break; + + /* + * Pass begin and end cache blocks to the worker and wake it. 
+ */ + r = request_invalidation(cache, &range); + if (r) + break; + } + + return r; +} + +/* + * Supports + * "<key> <value>" + * and + * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*" * * The key migration_threshold is supported by the cache target core. */ @@ -2841,6 +3054,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv) { struct cache *cache = ti->private; + if (!argc) + return -EINVAL; + + if (!strcmp(argv[0], "invalidate_cblocks")) + return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); + if (argc != 2) return -EINVAL; From 7b6b2bc98c0303b7f043ad5b35906f833e56308d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 12 Nov 2013 12:17:43 -0500 Subject: [PATCH 32/32] dm cache: resolve small nits and improve Documentation Document passthrough mode, cache shrinking, and cache invalidation. Also, use strcasecmp() and hlist_unhashed(). Reported-by: Alasdair G Kergon Signed-off-by: Mike Snitzer --- Documentation/device-mapper/cache.txt | 42 ++++++++++++++++++++------- drivers/md/dm-cache-policy-mq.c | 2 +- drivers/md/dm-cache-target.c | 2 +- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index fc9d2dfb9415..274752f8bdf9 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -86,16 +86,27 @@ If passthrough is selected, useful when the cache contents are not known to be coherent with the origin device, then all reads are served from the origin device (all reads miss the cache) and all writes are forwarded to the origin device; additionally, write hits cause cache -block invalidates. Passthrough mode allows a cache device to be -activated without having to worry about coherency. Coherency that -exists is maintained, although the cache will gradually cool as writes -take place. 
If the coherency of the cache can later be verified, or -established, the cache device can can be transitioned to writethrough or -writeback mode while still warm. Otherwise, the cache contents can be -discarded prior to transitioning to the desired operating mode. +block invalidates. To enable passthrough mode the cache must be clean. +Passthrough mode allows a cache device to be activated without having to +worry about coherency. Coherency that exists is maintained, although +the cache will gradually cool as writes take place. If the coherency of +the cache can later be verified, or established through use of the +"invalidate_cblocks" message, the cache device can be transitioned to +writethrough or writeback mode while still warm. Otherwise, the cache +contents can be discarded prior to transitioning to the desired +operating mode. A simple cleaner policy is provided, which will clean (write back) all -dirty blocks in a cache. Useful for decommissioning a cache. +dirty blocks in a cache. Useful for decommissioning a cache or when +shrinking a cache. Shrinking the cache's fast device requires all cache +blocks, in the area of the cache being removed, to be clean. If the +area being removed from the cache still contains dirty blocks the resize +will fail. Care must be taken to never reduce the volume used for the +cache's fast device until the cache is clean. This is of particular +importance if writeback mode is used. Writethrough and passthrough +modes already maintain a clean cache. Future support to partially clean +the cache, above a specified threshold, will allow for keeping the cache +warm and in writeback mode during resize. Migration throttling -------------------- @@ -174,7 +185,7 @@ Constructor block size : cache unit size in sectors #feature args : number of feature arguments passed - feature args : writethrough. (The default is writeback.) + feature args : writethrough or passthrough (The default is writeback.) 
policy : the replacement policy to use #policy args : an even number of arguments corresponding to @@ -190,6 +201,13 @@ Optional feature arguments are: back cache block contents later for performance reasons, so they may differ from the corresponding origin blocks. + passthrough : a degraded mode useful for various cache coherency + situations (e.g., rolling back snapshots of + underlying storage). Reads and writes always go to + the origin. If a write goes to a cached origin + block, then the cache block is invalidated. + To enable passthrough mode the cache must be clean. + A policy called 'default' is always registered. This is an alias for the policy we currently think is giving best all round performance. @@ -247,7 +265,11 @@ E.g. Invalidation is removing an entry from the cache without writing it back. Cache blocks can be invalidated via the invalidate_cblocks -message, which takes an arbitrary number of cblock ranges. +message, which takes an arbitrary number of cblock ranges. Each cblock +must be expressed as a decimal value; in the future a variant message +that takes cblock ranges expressed in hexadecimal may be needed to +better support efficient invalidation of larger caches. The cache must +be in passthrough mode when invalidate_cblocks is used. invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]* diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 7209fab8b8ed..416b7b752a6e 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -310,7 +310,7 @@ static void free_entry(struct entry_pool *ep, struct entry *e) static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) { struct entry *e = ep->entries + from_cblock(cblock); - return e->hlist.pprev ? e : NULL; + return !hlist_unhashed(&e->hlist) ? 
e : NULL; } static bool epool_empty(struct entry_pool *ep) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 41e664b474f1..9efcf1059b99 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -3057,7 +3057,7 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv) if (!argc) return -EINVAL; - if (!strcmp(argv[0], "invalidate_cblocks")) + if (!strcasecmp(argv[0], "invalidate_cblocks")) return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); if (argc != 2)