linux/drivers/md/dm-mpath.c

1753 lines
40 KiB
C
Raw Normal View History

/*
* Copyright (C) 2003 Sistina Software Limited.
* Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
#include <linux/device-mapper.h>
#include "dm.h"
#include "dm-path-selector.h"
#include "dm-uevent.h"
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include <scsi/scsi_dh.h>
#include <linux/atomic.h>
#define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
/* Path properties */
struct pgpath {
struct list_head list;
struct priority_group *pg; /* Owning PG */
unsigned is_active; /* Path status */
unsigned fail_count; /* Cumulative failure count */
struct dm_path path;
struct delayed_work activate_path;
};
#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
/*
* Paths are grouped into Priority Groups and numbered from 1 upwards.
* Each has a path selector which controls which path gets used.
*/
struct priority_group {
struct list_head list;
struct multipath *m; /* Owning multipath instance */
struct path_selector ps;
unsigned pg_num; /* Reference number */
unsigned bypassed; /* Temporarily bypass this PG? */
unsigned nr_pgpaths; /* Number of paths in PG */
struct list_head pgpaths;
};
/* Multipath context */
struct multipath {
struct list_head list;
struct dm_target *ti;
const char *hw_handler_name;
char *hw_handler_params;
spinlock_t lock;
unsigned nr_priority_groups;
struct list_head priority_groups;
wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
unsigned pg_init_required; /* pg_init needs calling? */
unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
unsigned pg_init_delay_retry; /* Delay pg_init retry? */
unsigned nr_valid_paths; /* Total number of usable paths */
struct pgpath *current_pgpath;
struct priority_group *current_pg;
struct priority_group *next_pg; /* Switch to this PG if set */
unsigned repeat_count; /* I/Os left before calling PS again */
unsigned queue_io:1; /* Must we queue all I/O? */
unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
dm mpath: fix race condition between multipath_dtr and pg_init_done Whenever multipath_dtr() is happening we must prevent queueing any further path activation work. Implement this by adding a new 'pg_init_disabled' flag to the multipath structure that denotes future path activation work should be skipped if it is set. By disabling pg_init and then re-enabling in flush_multipath_work() we also avoid the potential for pg_init to be initiated while suspending an mpath device. Without this patch a race condition exists that may result in a kernel panic: 1) If after pg_init_done() decrements pg_init_in_progress to 0, a call to wait_for_pg_init_completion() assumes there are no more pending path management commands. 2) If pg_init_required is set by pg_init_done(), due to retryable mode_select errors, then process_queued_ios() will again queue the path activation work. 3) If free_multipath() completes before activate_path() work is called a NULL pointer dereference like the following can be seen when accessing members of the recently destructed multipath: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 RIP: 0010:[<ffffffffa003db1b>] [<ffffffffa003db1b>] activate_path+0x1b/0x30 [dm_multipath] [<ffffffff81090ac0>] worker_thread+0x170/0x2a0 [<ffffffff81096c80>] ? autoremove_wake_function+0x0/0x40 [switch to disabling pg_init in flush_multipath_work & header edits by Mike Snitzer] Signed-off-by: Shiva Krishna Merla <shivakrishna.merla@netapp.com> Reviewed-by: Krishnasamy Somasundaram <somasundaram.krishnasamy@netapp.com> Tested-by: Speagle Andy <Andy.Speagle@netapp.com> Acked-by: Junichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org
2013-10-30 03:26:38 +00:00
unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
unsigned pg_init_retries; /* Number of times to retry pg_init */
unsigned pg_init_count; /* Number of times pg_init called */
unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
struct work_struct trigger_event;
/*
* We must use a mempool of dm_mpath_io structs so that we
* can resubmit bios on error.
*/
mempool_t *mpio_pool;
struct mutex work_mutex;
};
/*
* Context information attached to each bio we process.
*/
struct dm_mpath_io {
struct pgpath *pgpath;
size_t nr_bytes;
};
typedef int (*action_fn) (struct pgpath *pgpath);
static struct kmem_cache *_mpio_cache;
static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
static int __pgpath_busy(struct pgpath *pgpath);
/*-----------------------------------------------
* Allocation routines
*-----------------------------------------------*/
static struct pgpath *alloc_pgpath(void)
{
struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
if (pgpath) {
pgpath->is_active = 1;
INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
}
return pgpath;
}
static void free_pgpath(struct pgpath *pgpath)
{
kfree(pgpath);
}
static struct priority_group *alloc_priority_group(void)
{
struct priority_group *pg;
pg = kzalloc(sizeof(*pg), GFP_KERNEL);
if (pg)
INIT_LIST_HEAD(&pg->pgpaths);
return pg;
}
static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
{
struct pgpath *pgpath, *tmp;
struct multipath *m = ti->private;
list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
list_del(&pgpath->list);
if (m->hw_handler_name)
scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
dm_put_device(ti, pgpath->path.dev);
free_pgpath(pgpath);
}
}
static void free_priority_group(struct priority_group *pg,
struct dm_target *ti)
{
struct path_selector *ps = &pg->ps;
if (ps->type) {
ps->type->destroy(ps);
dm_put_path_selector(ps->type);
}
free_pgpaths(&pg->pgpaths, ti);
kfree(pg);
}
static struct multipath *alloc_multipath(struct dm_target *ti)
{
struct multipath *m;
unsigned min_ios = dm_get_reserved_rq_based_ios();
m = kzalloc(sizeof(*m), GFP_KERNEL);
if (m) {
INIT_LIST_HEAD(&m->priority_groups);
spin_lock_init(&m->lock);
m->queue_io = 1;
m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
INIT_WORK(&m->trigger_event, trigger_event);
init_waitqueue_head(&m->pg_init_wait);
mutex_init(&m->work_mutex);
m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
return NULL;
}
m->ti = ti;
ti->private = m;
}
return m;
}
static void free_multipath(struct multipath *m)
{
struct priority_group *pg, *tmp;
list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
list_del(&pg->list);
free_priority_group(pg, m->ti);
}
kfree(m->hw_handler_name);
kfree(m->hw_handler_params);
mempool_destroy(m->mpio_pool);
kfree(m);
}
static int set_mapinfo(struct multipath *m, union map_info *info)
{
struct dm_mpath_io *mpio;
mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
if (!mpio)
return -ENOMEM;
memset(mpio, 0, sizeof(*mpio));
info->ptr = mpio;
return 0;
}
static void clear_mapinfo(struct multipath *m, union map_info *info)
{
struct dm_mpath_io *mpio = info->ptr;
info->ptr = NULL;
mempool_free(mpio, m->mpio_pool);
}
/*-----------------------------------------------
* Path selection
*-----------------------------------------------*/
static int __pg_init_all_paths(struct multipath *m)
{
struct pgpath *pgpath;
unsigned long pg_init_delay = 0;
if (m->pg_init_in_progress || m->pg_init_disabled)
return 0;
m->pg_init_count++;
m->pg_init_required = 0;
/* Check here to reset pg_init_required */
if (!m->current_pg)
return 0;
if (m->pg_init_delay_retry)
pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
/* Skip failed paths */
if (!pgpath->is_active)
continue;
if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
pg_init_delay))
m->pg_init_in_progress++;
}
return m->pg_init_in_progress;
}
static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
{
m->current_pg = pgpath->pg;
/* Must we initialise the PG first, and queue I/O till it's ready? */
if (m->hw_handler_name) {
m->pg_init_required = 1;
m->queue_io = 1;
} else {
m->pg_init_required = 0;
m->queue_io = 0;
}
m->pg_init_count = 0;
}
static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
size_t nr_bytes)
{
struct dm_path *path;
path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
if (!path)
return -ENXIO;
m->current_pgpath = path_to_pgpath(path);
if (m->current_pg != pg)
__switch_pg(m, m->current_pgpath);
return 0;
}
static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
{
struct priority_group *pg;
unsigned bypassed = 1;
if (!m->nr_valid_paths)
goto failed;
/* Were we instructed to switch PG? */
if (m->next_pg) {
pg = m->next_pg;
m->next_pg = NULL;
if (!__choose_path_in_pg(m, pg, nr_bytes))
return;
}
/* Don't change PG until it has no remaining paths */
if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
return;
/*
* Loop through priority groups until we find a valid path.
* First time we skip PGs marked 'bypassed'.
* Second time we only try the ones we skipped, but set
* pg_init_delay_retry so we do not hammer controllers.
*/
do {
list_for_each_entry(pg, &m->priority_groups, list) {
if (pg->bypassed == bypassed)
continue;
if (!__choose_path_in_pg(m, pg, nr_bytes)) {
if (!bypassed)
m->pg_init_delay_retry = 1;
return;
}
}
} while (bypassed--);
failed:
m->current_pgpath = NULL;
m->current_pg = NULL;
}
/*
* Check whether bios must be queued in the device-mapper core rather
* than here in the target.
*
* m->lock must be held on entry.
*
* If m->queue_if_no_path and m->saved_queue_if_no_path hold the
* same value then we are not between multipath_presuspend()
* and multipath_resume() calls and we have no need to check
* for the DMF_NOFLUSH_SUSPENDING flag.
*/
static int __must_push_back(struct multipath *m)
{
return (m->queue_if_no_path ||
(m->queue_if_no_path != m->saved_queue_if_no_path &&
dm_noflush_suspending(m->ti)));
}
#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required)
/*
* Map cloned requests
*/
static int multipath_map(struct dm_target *ti, struct request *clone,
union map_info *map_context)
{
struct multipath *m = (struct multipath *) ti->private;
int r = DM_MAPIO_REQUEUE;
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
size_t nr_bytes = blk_rq_bytes(clone);
unsigned long flags;
struct pgpath *pgpath;
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
struct block_device *bdev;
struct dm_mpath_io *mpio;
spin_lock_irqsave(&m->lock, flags);
/* Do we need to select a new pgpath? */
if (!m->current_pgpath ||
(!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
__choose_pgpath(m, nr_bytes);
pgpath = m->current_pgpath;
if (!pgpath) {
if (!__must_push_back(m))
r = -EIO; /* Failed */
goto out_unlock;
}
if (!pg_ready(m)) {
__pg_init_all_paths(m);
goto out_unlock;
}
if (set_mapinfo(m, map_context) < 0)
/* ENOMEM, requeue */
goto out_unlock;
bdev = pgpath->path.dev->bdev;
clone->q = bdev_get_queue(bdev);
clone->rq_disk = bdev->bd_disk;
clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
mpio = map_context->ptr;
mpio->pgpath = pgpath;
mpio->nr_bytes = nr_bytes;
if (pgpath->pg->ps.type->start_io)
pgpath->pg->ps.type->start_io(&pgpath->pg->ps,
&pgpath->path,
nr_bytes);
r = DM_MAPIO_REMAPPED;
out_unlock:
spin_unlock_irqrestore(&m->lock, flags);
return r;
}
/*
* If we run out of usable paths, should we queue I/O or error it?
*/
static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
unsigned save_old_value)
{
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
if (save_old_value)
m->saved_queue_if_no_path = m->queue_if_no_path;
else
m->saved_queue_if_no_path = queue_if_no_path;
m->queue_if_no_path = queue_if_no_path;
if (!m->queue_if_no_path)
dm_table_run_md_queue_async(m->ti->table);
spin_unlock_irqrestore(&m->lock, flags);
return 0;
}
/*
* An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass.
*/
static void trigger_event(struct work_struct *work)
{
struct multipath *m =
container_of(work, struct multipath, trigger_event);
dm_table_event(m->ti->table);
}
/*-----------------------------------------------------------------
* Constructor/argument parsing:
* <#multipath feature args> [<arg>]*
* <#hw_handler args> [hw_handler [<arg>]*]
* <#priority groups>
* <initial priority group>
* [<selector> <#selector args> [<arg>]*
* <#paths> <#per-path selector args>
* [<path> [<arg>]* ]+ ]+
*---------------------------------------------------------------*/
static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
struct dm_target *ti)
{
int r;
struct path_selector_type *pst;
unsigned ps_argc;
static struct dm_arg _args[] = {
{0, 1024, "invalid number of path selector args"},
};
pst = dm_get_path_selector(dm_shift_arg(as));
if (!pst) {
ti->error = "unknown path selector type";
return -EINVAL;
}
r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
if (r) {
dm_put_path_selector(pst);
return -EINVAL;
}
r = pst->create(&pg->ps, ps_argc, as->argv);
if (r) {
dm_put_path_selector(pst);
ti->error = "path selector constructor failed";
return r;
}
pg->ps.type = pst;
dm_consume_args(as, ps_argc);
return 0;
}
static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
struct dm_target *ti)
{
int r;
struct pgpath *p;
struct multipath *m = ti->private;
struct request_queue *q = NULL;
const char *attached_handler_name;
/* we need at least a path arg */
if (as->argc < 1) {
ti->error = "no device given";
return ERR_PTR(-EINVAL);
}
p = alloc_pgpath();
if (!p)
return ERR_PTR(-ENOMEM);
r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
&p->path.dev);
if (r) {
ti->error = "error getting device";
goto bad;
}
if (m->retain_attached_hw_handler || m->hw_handler_name)
q = bdev_get_queue(p->path.dev->bdev);
if (m->retain_attached_hw_handler) {
attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
if (attached_handler_name) {
/*
* Reset hw_handler_name to match the attached handler
* and clear any hw_handler_params associated with the
* ignored handler.
*
* NB. This modifies the table line to show the actual
* handler instead of the original table passed in.
*/
kfree(m->hw_handler_name);
m->hw_handler_name = attached_handler_name;
kfree(m->hw_handler_params);
m->hw_handler_params = NULL;
}
}
if (m->hw_handler_name) {
/*
* Increments scsi_dh reference, even when using an
* already-attached handler.
*/
r = scsi_dh_attach(q, m->hw_handler_name);
if (r == -EBUSY) {
/*
* Already attached to different hw_handler:
* try to reattach with correct one.
*/
scsi_dh_detach(q);
r = scsi_dh_attach(q, m->hw_handler_name);
}
if (r < 0) {
ti->error = "error attaching hardware handler";
dm_put_device(ti, p->path.dev);
goto bad;
}
if (m->hw_handler_params) {
r = scsi_dh_set_params(q, m->hw_handler_params);
if (r < 0) {
ti->error = "unable to set hardware "
"handler parameters";
scsi_dh_detach(q);
dm_put_device(ti, p->path.dev);
goto bad;
}
}
}
r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
if (r) {
dm_put_device(ti, p->path.dev);
goto bad;
}
return p;
bad:
free_pgpath(p);
return ERR_PTR(r);
}
static struct priority_group *parse_priority_group(struct dm_arg_set *as,
struct multipath *m)
{
static struct dm_arg _args[] = {
{1, 1024, "invalid number of paths"},
{0, 1024, "invalid number of selector args"}
};
int r;
unsigned i, nr_selector_args, nr_args;
struct priority_group *pg;
struct dm_target *ti = m->ti;
if (as->argc < 2) {
as->argc = 0;
ti->error = "not enough priority group arguments";
return ERR_PTR(-EINVAL);
}
pg = alloc_priority_group();
if (!pg) {
ti->error = "couldn't allocate priority group";
return ERR_PTR(-ENOMEM);
}
pg->m = m;
r = parse_path_selector(as, pg, ti);
if (r)
goto bad;
/*
* read the paths
*/
r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
if (r)
goto bad;
r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
if (r)
goto bad;
nr_args = 1 + nr_selector_args;
for (i = 0; i < pg->nr_pgpaths; i++) {
struct pgpath *pgpath;
struct dm_arg_set path_args;
if (as->argc < nr_args) {
ti->error = "not enough path parameters";
r = -EINVAL;
goto bad;
}
path_args.argc = nr_args;
path_args.argv = as->argv;
pgpath = parse_path(&path_args, &pg->ps, ti);
if (IS_ERR(pgpath)) {
r = PTR_ERR(pgpath);
goto bad;
}
pgpath->pg = pg;
list_add_tail(&pgpath->list, &pg->pgpaths);
dm_consume_args(as, nr_args);
}
return pg;
bad:
free_priority_group(pg, ti);
return ERR_PTR(r);
}
static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
{
unsigned hw_argc;
int ret;
struct dm_target *ti = m->ti;
static struct dm_arg _args[] = {
{0, 1024, "invalid number of hardware handler args"},
};
if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
return -EINVAL;
if (!hw_argc)
return 0;
m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name),
"scsi_dh_%s", m->hw_handler_name)) {
ti->error = "unknown hardware handler type";
ret = -EINVAL;
goto fail;
}
if (hw_argc > 1) {
char *p;
int i, j, len = 4;
for (i = 0; i <= hw_argc - 2; i++)
len += strlen(as->argv[i]) + 1;
p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
if (!p) {
ti->error = "memory allocation failed";
ret = -ENOMEM;
goto fail;
}
j = sprintf(p, "%d", hw_argc - 1);
for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
j = sprintf(p, "%s", as->argv[i]);
}
dm_consume_args(as, hw_argc - 1);
return 0;
fail:
kfree(m->hw_handler_name);
m->hw_handler_name = NULL;
return ret;
}
static int parse_features(struct dm_arg_set *as, struct multipath *m)
{
int r;
unsigned argc;
struct dm_target *ti = m->ti;
const char *arg_name;
static struct dm_arg _args[] = {
{0, 6, "invalid number of feature args"},
{1, 50, "pg_init_retries must be between 1 and 50"},
{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
};
r = dm_read_arg_group(_args, as, &argc, &ti->error);
if (r)
return -EINVAL;
if (!argc)
return 0;
do {
arg_name = dm_shift_arg(as);
argc--;
if (!strcasecmp(arg_name, "queue_if_no_path")) {
r = queue_if_no_path(m, 1, 0);
continue;
}
if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
m->retain_attached_hw_handler = 1;
continue;
}
if (!strcasecmp(arg_name, "pg_init_retries") &&
(argc >= 1)) {
r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
argc--;
continue;
}
if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
(argc >= 1)) {
r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
argc--;
continue;
}
ti->error = "Unrecognised multipath feature request";
r = -EINVAL;
} while (argc && !r);
return r;
}
static int multipath_ctr(struct dm_target *ti, unsigned int argc,
char **argv)
{
/* target arguments */
static struct dm_arg _args[] = {
{0, 1024, "invalid number of priority groups"},
{0, 1024, "invalid initial priority group number"},
};
int r;
struct multipath *m;
struct dm_arg_set as;
unsigned pg_count = 0;
unsigned next_pg_num;
as.argc = argc;
as.argv = argv;
m = alloc_multipath(ti);
if (!m) {
ti->error = "can't allocate multipath";
return -EINVAL;
}
r = parse_features(&as, m);
if (r)
goto bad;
r = parse_hw_handler(&as, m);
if (r)
goto bad;
r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
if (r)
goto bad;
r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
if (r)
goto bad;
if ((!m->nr_priority_groups && next_pg_num) ||
(m->nr_priority_groups && !next_pg_num)) {
ti->error = "invalid initial priority group";
r = -EINVAL;
goto bad;
}
/* parse the priority groups */
while (as.argc) {
struct priority_group *pg;
pg = parse_priority_group(&as, m);
if (IS_ERR(pg)) {
r = PTR_ERR(pg);
goto bad;
}
m->nr_valid_paths += pg->nr_pgpaths;
list_add_tail(&pg->list, &m->priority_groups);
pg_count++;
pg->pg_num = pg_count;
if (!--next_pg_num)
m->next_pg = pg;
}
if (pg_count != m->nr_priority_groups) {
ti->error = "priority group count mismatch";
r = -EINVAL;
goto bad;
}
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
return 0;
bad:
free_multipath(m);
return r;
}
static void multipath_wait_for_pg_init_completion(struct multipath *m)
{
DECLARE_WAITQUEUE(wait, current);
unsigned long flags;
add_wait_queue(&m->pg_init_wait, &wait);
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
spin_lock_irqsave(&m->lock, flags);
if (!m->pg_init_in_progress) {
spin_unlock_irqrestore(&m->lock, flags);
break;
}
spin_unlock_irqrestore(&m->lock, flags);
io_schedule();
}
set_current_state(TASK_RUNNING);
remove_wait_queue(&m->pg_init_wait, &wait);
}
static void flush_multipath_work(struct multipath *m)
{
dm mpath: fix race condition between multipath_dtr and pg_init_done Whenever multipath_dtr() is happening we must prevent queueing any further path activation work. Implement this by adding a new 'pg_init_disabled' flag to the multipath structure that denotes future path activation work should be skipped if it is set. By disabling pg_init and then re-enabling in flush_multipath_work() we also avoid the potential for pg_init to be initiated while suspending an mpath device. Without this patch a race condition exists that may result in a kernel panic: 1) If after pg_init_done() decrements pg_init_in_progress to 0, a call to wait_for_pg_init_completion() assumes there are no more pending path management commands. 2) If pg_init_required is set by pg_init_done(), due to retryable mode_select errors, then process_queued_ios() will again queue the path activation work. 3) If free_multipath() completes before activate_path() work is called a NULL pointer dereference like the following can be seen when accessing members of the recently destructed multipath: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 RIP: 0010:[<ffffffffa003db1b>] [<ffffffffa003db1b>] activate_path+0x1b/0x30 [dm_multipath] [<ffffffff81090ac0>] worker_thread+0x170/0x2a0 [<ffffffff81096c80>] ? autoremove_wake_function+0x0/0x40 [switch to disabling pg_init in flush_multipath_work & header edits by Mike Snitzer] Signed-off-by: Shiva Krishna Merla <shivakrishna.merla@netapp.com> Reviewed-by: Krishnasamy Somasundaram <somasundaram.krishnasamy@netapp.com> Tested-by: Speagle Andy <Andy.Speagle@netapp.com> Acked-by: Junichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org
2013-10-30 03:26:38 +00:00
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
m->pg_init_disabled = 1;
spin_unlock_irqrestore(&m->lock, flags);
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
flush_workqueue(kmultipathd);
workqueue: deprecate flush[_delayed]_work_sync() flush[_delayed]_work_sync() are now spurious. Mark them deprecated and convert all users to flush[_delayed]_work(). If you're cc'd and wondering what's going on: Now all workqueues are non-reentrant and the regular flushes guarantee that the work item is not pending or running on any CPU on return, so there's no reason to use the sync flushes at all and they're going away. This patch doesn't make any functional difference. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Russell King <linux@arm.linux.org.uk> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Ian Campbell <ian.campbell@citrix.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Mattia Dongili <malattia@linux.it> Cc: Kent Yoder <key@linux.vnet.ibm.com> Cc: David Airlie <airlied@linux.ie> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Karsten Keil <isdn@linux-pingi.de> Cc: Bryan Wu <bryan.wu@canonical.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mauro Carvalho Chehab <mchehab@infradead.org> Cc: Florian Tobias Schandinat <FlorianSchandinat@gmx.de> Cc: David Woodhouse <dwmw2@infradead.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: linux-wireless@vger.kernel.org Cc: Anton Vorontsov <cbou@mail.ru> Cc: Sangbeom Kim <sbkim73@samsung.com> Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Eric Van Hensbergen <ericvh@gmail.com> Cc: Takashi Iwai <tiwai@suse.de> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Petr Vandrovec <petr@vandrovec.name> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Avi Kivity <avi@redhat.com>
2012-08-20 21:51:24 +00:00
flush_work(&m->trigger_event);
dm mpath: fix race condition between multipath_dtr and pg_init_done Whenever multipath_dtr() is happening we must prevent queueing any further path activation work. Implement this by adding a new 'pg_init_disabled' flag to the multipath structure that denotes future path activation work should be skipped if it is set. By disabling pg_init and then re-enabling in flush_multipath_work() we also avoid the potential for pg_init to be initiated while suspending an mpath device. Without this patch a race condition exists that may result in a kernel panic: 1) If after pg_init_done() decrements pg_init_in_progress to 0, a call to wait_for_pg_init_completion() assumes there are no more pending path management commands. 2) If pg_init_required is set by pg_init_done(), due to retryable mode_select errors, then process_queued_ios() will again queue the path activation work. 3) If free_multipath() completes before activate_path() work is called a NULL pointer dereference like the following can be seen when accessing members of the recently destructed multipath: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 RIP: 0010:[<ffffffffa003db1b>] [<ffffffffa003db1b>] activate_path+0x1b/0x30 [dm_multipath] [<ffffffff81090ac0>] worker_thread+0x170/0x2a0 [<ffffffff81096c80>] ? autoremove_wake_function+0x0/0x40 [switch to disabling pg_init in flush_multipath_work & header edits by Mike Snitzer] Signed-off-by: Shiva Krishna Merla <shivakrishna.merla@netapp.com> Reviewed-by: Krishnasamy Somasundaram <somasundaram.krishnasamy@netapp.com> Tested-by: Speagle Andy <Andy.Speagle@netapp.com> Acked-by: Junichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org
2013-10-30 03:26:38 +00:00
spin_lock_irqsave(&m->lock, flags);
m->pg_init_disabled = 0;
spin_unlock_irqrestore(&m->lock, flags);
}
static void multipath_dtr(struct dm_target *ti)
{
struct multipath *m = ti->private;
flush_multipath_work(m);
free_multipath(m);
}
/*
* Take a path out of use.
*/
static int fail_path(struct pgpath *pgpath)
{
unsigned long flags;
struct multipath *m = pgpath->pg->m;
spin_lock_irqsave(&m->lock, flags);
if (!pgpath->is_active)
goto out;
DMWARN("Failing path %s.", pgpath->path.dev->name);
pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
pgpath->is_active = 0;
pgpath->fail_count++;
m->nr_valid_paths--;
if (pgpath == m->current_pgpath)
m->current_pgpath = NULL;
dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
pgpath->path.dev->name, m->nr_valid_paths);
schedule_work(&m->trigger_event);
out:
spin_unlock_irqrestore(&m->lock, flags);
return 0;
}
/*
* Reinstate a previously-failed path
*/
static int reinstate_path(struct pgpath *pgpath)
{
int r = 0;
unsigned long flags;
struct multipath *m = pgpath->pg->m;
spin_lock_irqsave(&m->lock, flags);
if (pgpath->is_active)
goto out;
if (!pgpath->pg->ps.type->reinstate_path) {
DMWARN("Reinstate path not supported by path selector %s",
pgpath->pg->ps.type->name);
r = -EINVAL;
goto out;
}
r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
if (r)
goto out;
pgpath->is_active = 1;
if (!m->nr_valid_paths++) {
m->current_pgpath = NULL;
dm_table_run_md_queue_async(m->ti->table);
} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
m->pg_init_in_progress++;
}
dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
pgpath->path.dev->name, m->nr_valid_paths);
schedule_work(&m->trigger_event);
out:
spin_unlock_irqrestore(&m->lock, flags);
return r;
}
/*
* Fail or reinstate all paths that match the provided struct dm_dev.
*/
static int action_dev(struct multipath *m, struct dm_dev *dev,
action_fn action)
{
int r = -EINVAL;
struct pgpath *pgpath;
struct priority_group *pg;
list_for_each_entry(pg, &m->priority_groups, list) {
list_for_each_entry(pgpath, &pg->pgpaths, list) {
if (pgpath->path.dev == dev)
r = action(pgpath);
}
}
return r;
}
/*
* Temporarily try to avoid having to use the specified PG
*/
static void bypass_pg(struct multipath *m, struct priority_group *pg,
int bypassed)
{
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
pg->bypassed = bypassed;
m->current_pgpath = NULL;
m->current_pg = NULL;
spin_unlock_irqrestore(&m->lock, flags);
schedule_work(&m->trigger_event);
}
/*
* Switch to using the specified PG from the next I/O that gets mapped
*/
static int switch_pg_num(struct multipath *m, const char *pgstr)
{
struct priority_group *pg;
unsigned pgnum;
unsigned long flags;
char dummy;
if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
(pgnum > m->nr_priority_groups)) {
DMWARN("invalid PG number supplied to switch_pg_num");
return -EINVAL;
}
spin_lock_irqsave(&m->lock, flags);
list_for_each_entry(pg, &m->priority_groups, list) {
pg->bypassed = 0;
if (--pgnum)
continue;
m->current_pgpath = NULL;
m->current_pg = NULL;
m->next_pg = pg;
}
spin_unlock_irqrestore(&m->lock, flags);
schedule_work(&m->trigger_event);
return 0;
}
/*
* Set/clear bypassed status of a PG.
* PGs are numbered upwards from 1 in the order they were declared.
*/
static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
{
struct priority_group *pg;
unsigned pgnum;
char dummy;
if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
(pgnum > m->nr_priority_groups)) {
DMWARN("invalid PG number supplied to bypass_pg");
return -EINVAL;
}
list_for_each_entry(pg, &m->priority_groups, list) {
if (!--pgnum)
break;
}
bypass_pg(m, pg, bypassed);
return 0;
}
/*
* Should we retry pg_init immediately?
*/
static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
{
unsigned long flags;
int limit_reached = 0;
spin_lock_irqsave(&m->lock, flags);
dm mpath: fix race condition between multipath_dtr and pg_init_done Whenever multipath_dtr() is happening we must prevent queueing any further path activation work. Implement this by adding a new 'pg_init_disabled' flag to the multipath structure that denotes future path activation work should be skipped if it is set. By disabling pg_init and then re-enabling in flush_multipath_work() we also avoid the potential for pg_init to be initiated while suspending an mpath device. Without this patch a race condition exists that may result in a kernel panic: 1) If after pg_init_done() decrements pg_init_in_progress to 0, a call to wait_for_pg_init_completion() assumes there are no more pending path management commands. 2) If pg_init_required is set by pg_init_done(), due to retryable mode_select errors, then process_queued_ios() will again queue the path activation work. 3) If free_multipath() completes before activate_path() work is called a NULL pointer dereference like the following can be seen when accessing members of the recently destructed multipath: BUG: unable to handle kernel NULL pointer dereference at 0000000000000090 RIP: 0010:[<ffffffffa003db1b>] [<ffffffffa003db1b>] activate_path+0x1b/0x30 [dm_multipath] [<ffffffff81090ac0>] worker_thread+0x170/0x2a0 [<ffffffff81096c80>] ? autoremove_wake_function+0x0/0x40 [switch to disabling pg_init in flush_multipath_work & header edits by Mike Snitzer] Signed-off-by: Shiva Krishna Merla <shivakrishna.merla@netapp.com> Reviewed-by: Krishnasamy Somasundaram <somasundaram.krishnasamy@netapp.com> Tested-by: Speagle Andy <Andy.Speagle@netapp.com> Acked-by: Junichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org
2013-10-30 03:26:38 +00:00
if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
m->pg_init_required = 1;
else
limit_reached = 1;
spin_unlock_irqrestore(&m->lock, flags);
return limit_reached;
}
static void pg_init_done(void *data, int errors)
{
struct pgpath *pgpath = data;
struct priority_group *pg = pgpath->pg;
struct multipath *m = pg->m;
unsigned long flags;
unsigned delay_retry = 0;
/* device or driver problems */
switch (errors) {
case SCSI_DH_OK:
break;
case SCSI_DH_NOSYS:
if (!m->hw_handler_name) {
errors = 0;
break;
}
DMERR("Could not failover the device: Handler scsi_dh_%s "
"Error %d.", m->hw_handler_name, errors);
/*
* Fail path for now, so we do not ping pong
*/
fail_path(pgpath);
break;
case SCSI_DH_DEV_TEMP_BUSY:
/*
* Probably doing something like FW upgrade on the
* controller so try the other pg.
*/
bypass_pg(m, pg, 1);
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
delay_retry = 1;
case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL:
if (pg_init_limit_reached(m, pgpath))
fail_path(pgpath);
errors = 0;
break;
default:
/*
* We probably do not want to fail the path for a device
* error, but this is what the old dm did. In future
* patches we can do more advanced handling.
*/
fail_path(pgpath);
}
spin_lock_irqsave(&m->lock, flags);
if (errors) {
if (pgpath == m->current_pgpath) {
DMERR("Could not failover device. Error %d.", errors);
m->current_pgpath = NULL;
m->current_pg = NULL;
}
} else if (!m->pg_init_required)
pg->bypassed = 0;
if (--m->pg_init_in_progress)
/* Activations of other paths are still on going */
goto out;
if (m->pg_init_required) {
m->pg_init_delay_retry = delay_retry;
if (__pg_init_all_paths(m))
goto out;
}
m->queue_io = 0;
/*
* Wake up any thread waiting to suspend.
*/
wake_up(&m->pg_init_wait);
out:
spin_unlock_irqrestore(&m->lock, flags);
}
static void activate_path(struct work_struct *work)
{
struct pgpath *pgpath =
container_of(work, struct pgpath, activate_path.work);
if (pgpath->is_active)
scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
pg_init_done, pgpath);
else
pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
}
static int noretry_error(int error)
{
switch (error) {
case -EOPNOTSUPP:
case -EREMOTEIO:
case -EILSEQ:
case -ENODATA:
case -ENOSPC:
return 1;
}
/* Anything else could be a path failure, so should be retried */
return 0;
}
/*
* end_io handling
*/
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
static int do_end_io(struct multipath *m, struct request *clone,
int error, struct dm_mpath_io *mpio)
{
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
/*
* We don't queue any clone request inside the multipath target
* during end I/O handling, since those clone requests don't have
* bio clones. If we queue them inside the multipath target,
* we need to make bio clones, that requires memory allocation.
* (See drivers/md/dm.c:end_clone_bio() about why the clone requests
* don't have bio clones.)
* Instead of queueing the clone request here, we queue the original
* request into dm core, which will remake a clone request and
* clone bios for it and resubmit it later.
*/
int r = DM_ENDIO_REQUEUE;
unsigned long flags;
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
if (!error && !clone->errors)
return 0; /* I/O complete */
dm mpath: disable WRITE SAME if it fails Workaround the SCSI layer's problematic WRITE SAME heuristics by disabling WRITE SAME in the DM multipath device's queue_limits if an underlying device disabled it. The WRITE SAME heuristics, with both the original commit 5db44863b6eb ("[SCSI] sd: Implement support for WRITE SAME") and the updated commit 66c28f971 ("[SCSI] sd: Update WRITE SAME heuristics"), default to enabling WRITE SAME(10) even without successfully determining it is supported. After the first failed WRITE SAME the SCSI layer will disable WRITE SAME for the device (by setting sdkp->device->no_write_same which results in 'max_write_same_sectors' in device's queue_limits to be set to 0). When a device is stacked ontop of such a SCSI device any changes to that SCSI device's queue_limits do not automatically propagate up the stack. As such, a DM multipath device will not have its WRITE SAME support disabled. This causes the block layer to continue to issue WRITE SAME requests to the mpath device which causes paths to fail and (if mpath IO isn't configured to queue when no paths are available) it will result in actual IO errors to the upper layers. This fix doesn't help configurations that have additional devices stacked ontop of the mpath device (e.g. LVM created linear DM devices ontop). A proper fix that restacks all the queue_limits from the bottom of the device stack up will need to be explored if SCSI will continue to use this model of optimistically allowing op codes and then disabling them after they fail for the first time. Before this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: failing WRITE SAME IO with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. device-mapper: multipath: Failing path 8:112. end_request: I/O error, dev dm-6, sector 4616 dm-6: WRITE SAME failed. Manually zeroing. end_request: I/O error, dev dm-6, sector 4616 end_request: I/O error, dev dm-6, sector 5640 end_request: I/O error, dev dm-6, sector 6664 end_request: I/O error, dev dm-6, sector 7688 end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. end_request: I/O error, dev dm-6, sector 524296 Aborting journal on device dm-6-8. end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 33553920 After this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: WRITE SAME I/O failed with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 0 It should be noted that WRITE SAME support wasn't enabled in DM multipath until v3.10. Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: Martin K. Petersen <martin.petersen@oracle.com> Cc: Hannes Reinecke <hare@suse.de> Cc: stable@vger.kernel.org # 3.10+
2013-09-19 16:13:58 +00:00
if (noretry_error(error)) {
if ((clone->cmd_flags & REQ_WRITE_SAME) &&
!clone->q->limits.max_write_same_sectors) {
struct queue_limits *limits;
/* device doesn't really support WRITE SAME, disable it */
limits = dm_get_queue_limits(dm_table_get_md(m->ti->table));
limits->max_write_same_sectors = 0;
}
return error;
dm mpath: disable WRITE SAME if it fails Workaround the SCSI layer's problematic WRITE SAME heuristics by disabling WRITE SAME in the DM multipath device's queue_limits if an underlying device disabled it. The WRITE SAME heuristics, with both the original commit 5db44863b6eb ("[SCSI] sd: Implement support for WRITE SAME") and the updated commit 66c28f971 ("[SCSI] sd: Update WRITE SAME heuristics"), default to enabling WRITE SAME(10) even without successfully determining it is supported. After the first failed WRITE SAME the SCSI layer will disable WRITE SAME for the device (by setting sdkp->device->no_write_same which results in 'max_write_same_sectors' in device's queue_limits to be set to 0). When a device is stacked ontop of such a SCSI device any changes to that SCSI device's queue_limits do not automatically propagate up the stack. As such, a DM multipath device will not have its WRITE SAME support disabled. This causes the block layer to continue to issue WRITE SAME requests to the mpath device which causes paths to fail and (if mpath IO isn't configured to queue when no paths are available) it will result in actual IO errors to the upper layers. This fix doesn't help configurations that have additional devices stacked ontop of the mpath device (e.g. LVM created linear DM devices ontop). A proper fix that restacks all the queue_limits from the bottom of the device stack up will need to be explored if SCSI will continue to use this model of optimistically allowing op codes and then disabling them after they fail for the first time. Before this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: failing WRITE SAME IO with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. device-mapper: multipath: Failing path 8:112. end_request: I/O error, dev dm-6, sector 4616 dm-6: WRITE SAME failed. Manually zeroing. end_request: I/O error, dev dm-6, sector 4616 end_request: I/O error, dev dm-6, sector 5640 end_request: I/O error, dev dm-6, sector 6664 end_request: I/O error, dev dm-6, sector 7688 end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. end_request: I/O error, dev dm-6, sector 524296 Aborting journal on device dm-6-8. end_request: I/O error, dev dm-6, sector 524288 Buffer I/O error on device dm-6, logical block 65536 lost page write due to I/O error on dm-6 JBD2: Error -5 detected when updating journal superblock for dm-6-8. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 33553920 After this patch: EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: (null) device-mapper: multipath: XXX snitm debugging: got -EREMOTEIO (-121) device-mapper: multipath: XXX snitm debugging: WRITE SAME I/O failed with error=-121 end_request: critical target error, dev dm-6, sector 528 dm-6: WRITE SAME failed. Manually zeroing. # cat /sys/block/sdh/queue/write_same_max_bytes 0 # cat /sys/block/dm-6/queue/write_same_max_bytes 0 It should be noted that WRITE SAME support wasn't enabled in DM multipath until v3.10. Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: Martin K. Petersen <martin.petersen@oracle.com> Cc: Hannes Reinecke <hare@suse.de> Cc: stable@vger.kernel.org # 3.10+
2013-09-19 16:13:58 +00:00
}
if (mpio->pgpath)
fail_path(mpio->pgpath);
spin_lock_irqsave(&m->lock, flags);
if (!m->nr_valid_paths) {
if (!m->queue_if_no_path) {
if (!__must_push_back(m))
r = -EIO;
} else {
if (error == -EBADE)
r = error;
}
}
spin_unlock_irqrestore(&m->lock, flags);
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
return r;
}
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
static int multipath_end_io(struct dm_target *ti, struct request *clone,
int error, union map_info *map_context)
{
struct multipath *m = ti->private;
struct dm_mpath_io *mpio = map_context->ptr;
struct pgpath *pgpath;
struct path_selector *ps;
int r;
BUG_ON(!mpio);
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
r = do_end_io(m, clone, error, mpio);
pgpath = mpio->pgpath;
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
clear_mapinfo(m, map_context);
return r;
}
/*
* Suspend can't complete until all the I/O is processed so if
* the last path fails we must error any remaining I/O.
* Note that if the freeze_bdev fails while suspending, the
* queue_if_no_path state is lost - userspace should reset it.
*/
static void multipath_presuspend(struct dm_target *ti)
{
struct multipath *m = (struct multipath *) ti->private;
queue_if_no_path(m, 0, 1);
}
static void multipath_postsuspend(struct dm_target *ti)
{
struct multipath *m = ti->private;
mutex_lock(&m->work_mutex);
flush_multipath_work(m);
mutex_unlock(&m->work_mutex);
}
/*
* Restore the queue_if_no_path setting.
*/
static void multipath_resume(struct dm_target *ti)
{
struct multipath *m = (struct multipath *) ti->private;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
m->queue_if_no_path = m->saved_queue_if_no_path;
spin_unlock_irqrestore(&m->lock, flags);
}
/*
* Info output has the following format:
* num_multipath_feature_args [multipath_feature_args]*
* num_handler_status_args [handler_status_args]*
* num_groups init_group_number
* [A|D|E num_ps_status_args [ps_status_args]*
* num_paths num_selector_args
* [path_dev A|F fail_count [selector_args]* ]+ ]+
*
* Table output has the following format (identical to the constructor string):
* num_feature_args [features_args]*
* num_handler_args hw_handler [hw_handler_args]*
* num_groups init_group_number
* [priority selector-name num_ps_args [ps_args]*
* num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
*/
dm: fix truncated status strings Avoid returning a truncated table or status string instead of setting the DM_BUFFER_FULL_FLAG when the last target of a table fills the buffer. When processing a table or status request, the function retrieve_status calls ti->type->status. If ti->type->status returns non-zero, retrieve_status assumes that the buffer overflowed and sets DM_BUFFER_FULL_FLAG. However, targets don't return non-zero values from their status method on overflow. Most targets returns always zero. If a buffer overflow happens in a target that is not the last in the table, it gets noticed during the next iteration of the loop in retrieve_status; but if a buffer overflow happens in the last target, it goes unnoticed and erroneously truncated data is returned. In the current code, the targets behave in the following way: * dm-crypt returns -ENOMEM if there is not enough space to store the key, but it returns 0 on all other overflows. * dm-thin returns errors from the status method if a disk error happened. This is incorrect because retrieve_status doesn't check the error code, it assumes that all non-zero values mean buffer overflow. * all the other targets always return 0. This patch changes the ti->type->status function to return void (because most targets don't use the return code). Overflow is detected in retrieve_status: if the status method fills up the remaining space completely, it is assumed that buffer overflow happened. Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2013-03-01 22:45:44 +00:00
static void multipath_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
int sz = 0;
unsigned long flags;
struct multipath *m = (struct multipath *) ti->private;
struct priority_group *pg;
struct pgpath *p;
unsigned pg_num;
char state;
spin_lock_irqsave(&m->lock, flags);
/* Features */
if (type == STATUSTYPE_INFO)
DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count);
else {
DMEMIT("%u ", m->queue_if_no_path +
(m->pg_init_retries > 0) * 2 +
(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
m->retain_attached_hw_handler);
if (m->queue_if_no_path)
DMEMIT("queue_if_no_path ");
if (m->pg_init_retries)
DMEMIT("pg_init_retries %u ", m->pg_init_retries);
if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
if (m->retain_attached_hw_handler)
DMEMIT("retain_attached_hw_handler ");
}
if (!m->hw_handler_name || type == STATUSTYPE_INFO)
DMEMIT("0 ");
else
DMEMIT("1 %s ", m->hw_handler_name);
DMEMIT("%u ", m->nr_priority_groups);
if (m->next_pg)
pg_num = m->next_pg->pg_num;
else if (m->current_pg)
pg_num = m->current_pg->pg_num;
else
pg_num = (m->nr_priority_groups ? 1 : 0);
DMEMIT("%u ", pg_num);
switch (type) {
case STATUSTYPE_INFO:
list_for_each_entry(pg, &m->priority_groups, list) {
if (pg->bypassed)
state = 'D'; /* Disabled */
else if (pg == m->current_pg)
state = 'A'; /* Currently Active */
else
state = 'E'; /* Enabled */
DMEMIT("%c ", state);
if (pg->ps.type->status)
sz += pg->ps.type->status(&pg->ps, NULL, type,
result + sz,
maxlen - sz);
else
DMEMIT("0 ");
DMEMIT("%u %u ", pg->nr_pgpaths,
pg->ps.type->info_args);
list_for_each_entry(p, &pg->pgpaths, list) {
DMEMIT("%s %s %u ", p->path.dev->name,
p->is_active ? "A" : "F",
p->fail_count);
if (pg->ps.type->status)
sz += pg->ps.type->status(&pg->ps,
&p->path, type, result + sz,
maxlen - sz);
}
}
break;
case STATUSTYPE_TABLE:
list_for_each_entry(pg, &m->priority_groups, list) {
DMEMIT("%s ", pg->ps.type->name);
if (pg->ps.type->status)
sz += pg->ps.type->status(&pg->ps, NULL, type,
result + sz,
maxlen - sz);
else
DMEMIT("0 ");
DMEMIT("%u %u ", pg->nr_pgpaths,
pg->ps.type->table_args);
list_for_each_entry(p, &pg->pgpaths, list) {
DMEMIT("%s ", p->path.dev->name);
if (pg->ps.type->status)
sz += pg->ps.type->status(&pg->ps,
&p->path, type, result + sz,
maxlen - sz);
}
}
break;
}
spin_unlock_irqrestore(&m->lock, flags);
}
static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
{
int r = -EINVAL;
struct dm_dev *dev;
struct multipath *m = (struct multipath *) ti->private;
action_fn action;
mutex_lock(&m->work_mutex);
if (dm_suspended(ti)) {
r = -EBUSY;
goto out;
}
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, 1, 0);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, 0, 0);
goto out;
}
}
if (argc != 2) {
DMWARN("Invalid multipath message arguments. Expected 2 arguments, got %d.", argc);
goto out;
}
if (!strcasecmp(argv[0], "disable_group")) {
r = bypass_pg_num(m, argv[1], 1);
goto out;
} else if (!strcasecmp(argv[0], "enable_group")) {
r = bypass_pg_num(m, argv[1], 0);
goto out;
} else if (!strcasecmp(argv[0], "switch_group")) {
r = switch_pg_num(m, argv[1]);
goto out;
} else if (!strcasecmp(argv[0], "reinstate_path"))
action = reinstate_path;
else if (!strcasecmp(argv[0], "fail_path"))
action = fail_path;
else {
DMWARN("Unrecognised multipath message received: %s", argv[0]);
goto out;
}
r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
if (r) {
DMWARN("message: error getting device %s",
argv[1]);
goto out;
}
r = action_dev(m, dev, action);
dm_put_device(ti, dev);
out:
mutex_unlock(&m->work_mutex);
return r;
}
static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
unsigned long arg)
{
struct multipath *m = ti->private;
dm mpath: only retry ioctl when no paths if queue_if_no_path set When there are no paths and multipath receives an ioctl, it waits until a path becomes available. This behaviour is incorrect if the "queue_if_no_path" setting was not specified, as then the ioctl should be rejected immediately, which this patch now does. commit 35991652b ("dm mpath: allow ioctls to trigger pg init") should have checked if queue_if_no_path was configured before queueing IO. Checking for the queue_if_no_path feature, like is done in map_io(), allows the following table load to work without blocking in the multipath_ioctl retry loop: echo "0 1024 multipath 0 0 0 0" | dmsetup create mpath_nodevs Without this fix the multipath_ioctl will block with the following stack trace: blkid D 0000000000000002 0 23936 1 0x00000000 ffff8802b89e5cd8 0000000000000082 ffff8802b89e5fd8 0000000000012440 ffff8802b89e4010 0000000000012440 0000000000012440 0000000000012440 ffff8802b89e5fd8 0000000000012440 ffff88030c2aab30 ffff880325794040 Call Trace: [<ffffffff814ce099>] schedule+0x29/0x70 [<ffffffff814cc312>] schedule_timeout+0x182/0x2e0 [<ffffffff8104dee0>] ? lock_timer_base+0x70/0x70 [<ffffffff814cc48e>] schedule_timeout_uninterruptible+0x1e/0x20 [<ffffffff8104f840>] msleep+0x20/0x30 [<ffffffffa0000839>] multipath_ioctl+0x109/0x170 [dm_multipath] [<ffffffffa06bfb9c>] dm_blk_ioctl+0xbc/0xd0 [dm_mod] [<ffffffff8122a408>] __blkdev_driver_ioctl+0x28/0x30 [<ffffffff8122a79e>] blkdev_ioctl+0xce/0x730 [<ffffffff811970ac>] block_ioctl+0x3c/0x40 [<ffffffff8117321c>] do_vfs_ioctl+0x8c/0x340 [<ffffffff81166293>] ? sys_newfstat+0x33/0x40 [<ffffffff81173571>] sys_ioctl+0xa1/0xb0 [<ffffffff814d70a9>] system_call_fastpath+0x16/0x1b Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org # 3.5+ Acked-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2012-09-26 22:45:41 +00:00
struct pgpath *pgpath;
struct block_device *bdev;
fmode_t mode;
unsigned long flags;
int r;
bdev = NULL;
mode = 0;
r = 0;
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pgpath)
__choose_pgpath(m, 0);
dm mpath: only retry ioctl when no paths if queue_if_no_path set When there are no paths and multipath receives an ioctl, it waits until a path becomes available. This behaviour is incorrect if the "queue_if_no_path" setting was not specified, as then the ioctl should be rejected immediately, which this patch now does. commit 35991652b ("dm mpath: allow ioctls to trigger pg init") should have checked if queue_if_no_path was configured before queueing IO. Checking for the queue_if_no_path feature, like is done in map_io(), allows the following table load to work without blocking in the multipath_ioctl retry loop: echo "0 1024 multipath 0 0 0 0" | dmsetup create mpath_nodevs Without this fix the multipath_ioctl will block with the following stack trace: blkid D 0000000000000002 0 23936 1 0x00000000 ffff8802b89e5cd8 0000000000000082 ffff8802b89e5fd8 0000000000012440 ffff8802b89e4010 0000000000012440 0000000000012440 0000000000012440 ffff8802b89e5fd8 0000000000012440 ffff88030c2aab30 ffff880325794040 Call Trace: [<ffffffff814ce099>] schedule+0x29/0x70 [<ffffffff814cc312>] schedule_timeout+0x182/0x2e0 [<ffffffff8104dee0>] ? lock_timer_base+0x70/0x70 [<ffffffff814cc48e>] schedule_timeout_uninterruptible+0x1e/0x20 [<ffffffff8104f840>] msleep+0x20/0x30 [<ffffffffa0000839>] multipath_ioctl+0x109/0x170 [dm_multipath] [<ffffffffa06bfb9c>] dm_blk_ioctl+0xbc/0xd0 [dm_mod] [<ffffffff8122a408>] __blkdev_driver_ioctl+0x28/0x30 [<ffffffff8122a79e>] blkdev_ioctl+0xce/0x730 [<ffffffff811970ac>] block_ioctl+0x3c/0x40 [<ffffffff8117321c>] do_vfs_ioctl+0x8c/0x340 [<ffffffff81166293>] ? sys_newfstat+0x33/0x40 [<ffffffff81173571>] sys_ioctl+0xa1/0xb0 [<ffffffff814d70a9>] system_call_fastpath+0x16/0x1b Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org # 3.5+ Acked-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2012-09-26 22:45:41 +00:00
pgpath = m->current_pgpath;
if (pgpath) {
bdev = pgpath->path.dev->bdev;
mode = pgpath->path.dev->mode;
}
dm mpath: only retry ioctl when no paths if queue_if_no_path set When there are no paths and multipath receives an ioctl, it waits until a path becomes available. This behaviour is incorrect if the "queue_if_no_path" setting was not specified, as then the ioctl should be rejected immediately, which this patch now does. commit 35991652b ("dm mpath: allow ioctls to trigger pg init") should have checked if queue_if_no_path was configured before queueing IO. Checking for the queue_if_no_path feature, like is done in map_io(), allows the following table load to work without blocking in the multipath_ioctl retry loop: echo "0 1024 multipath 0 0 0 0" | dmsetup create mpath_nodevs Without this fix the multipath_ioctl will block with the following stack trace: blkid D 0000000000000002 0 23936 1 0x00000000 ffff8802b89e5cd8 0000000000000082 ffff8802b89e5fd8 0000000000012440 ffff8802b89e4010 0000000000012440 0000000000012440 0000000000012440 ffff8802b89e5fd8 0000000000012440 ffff88030c2aab30 ffff880325794040 Call Trace: [<ffffffff814ce099>] schedule+0x29/0x70 [<ffffffff814cc312>] schedule_timeout+0x182/0x2e0 [<ffffffff8104dee0>] ? lock_timer_base+0x70/0x70 [<ffffffff814cc48e>] schedule_timeout_uninterruptible+0x1e/0x20 [<ffffffff8104f840>] msleep+0x20/0x30 [<ffffffffa0000839>] multipath_ioctl+0x109/0x170 [dm_multipath] [<ffffffffa06bfb9c>] dm_blk_ioctl+0xbc/0xd0 [dm_mod] [<ffffffff8122a408>] __blkdev_driver_ioctl+0x28/0x30 [<ffffffff8122a79e>] blkdev_ioctl+0xce/0x730 [<ffffffff811970ac>] block_ioctl+0x3c/0x40 [<ffffffff8117321c>] do_vfs_ioctl+0x8c/0x340 [<ffffffff81166293>] ? sys_newfstat+0x33/0x40 [<ffffffff81173571>] sys_ioctl+0xa1/0xb0 [<ffffffff814d70a9>] system_call_fastpath+0x16/0x1b Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org # 3.5+ Acked-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2012-09-26 22:45:41 +00:00
if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
r = -ENOTCONN;
else if (!bdev)
r = -EIO;
spin_unlock_irqrestore(&m->lock, flags);
/*
* Only pass ioctls through if the device sizes match exactly.
*/
if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
int err = scsi_verify_blk_ioctl(NULL, cmd);
if (err)
r = err;
}
if (r == -ENOTCONN && !fatal_signal_pending(current)) {
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pg) {
/* Path status changed, redo selection */
__choose_pgpath(m, 0);
}
if (m->pg_init_required)
__pg_init_all_paths(m);
spin_unlock_irqrestore(&m->lock, flags);
dm_table_run_md_queue_async(m->ti->table);
}
return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
static int multipath_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct multipath *m = ti->private;
struct priority_group *pg;
struct pgpath *p;
int ret = 0;
list_for_each_entry(pg, &m->priority_groups, list) {
list_for_each_entry(p, &pg->pgpaths, list) {
ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
if (ret)
goto out;
}
}
out:
return ret;
}
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
static int __pgpath_busy(struct pgpath *pgpath)
{
struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
return dm_underlying_device_busy(q);
}
/*
* We return "busy", only when we can map I/Os but underlying devices
* are busy (so even if we map I/Os now, the I/Os will wait on
* the underlying queue).
* In other words, if we want to kill I/Os or queue them inside us
* due to map unavailability, we don't return "busy". Otherwise,
* dm core won't give us the I/Os and we can't do what we want.
*/
static int multipath_busy(struct dm_target *ti)
{
int busy = 0, has_active = 0;
struct multipath *m = ti->private;
struct priority_group *pg;
struct pgpath *pgpath;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
/* pg_init in progress, requeue until done */
if (!pg_ready(m)) {
busy = 1;
goto out;
}
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
/* Guess which priority_group will be used at next mapping time */
if (unlikely(!m->current_pgpath && m->next_pg))
pg = m->next_pg;
else if (likely(m->current_pg))
pg = m->current_pg;
else
/*
* We don't know which pg will be used at next mapping time.
* We don't call __choose_pgpath() here to avoid to trigger
* pg_init just by busy checking.
* So we don't know whether underlying devices we will be using
* at next mapping time are busy or not. Just try mapping.
*/
goto out;
/*
* If there is one non-busy active path at least, the path selector
* will be able to select it. So we consider such a pg as not busy.
*/
busy = 1;
list_for_each_entry(pgpath, &pg->pgpaths, list)
if (pgpath->is_active) {
has_active = 1;
if (!__pgpath_busy(pgpath)) {
busy = 0;
break;
}
}
if (!has_active)
/*
* No active path in this pg, so this pg won't be used and
* the current_pg will be changed at next mapping time.
* We need to try mapping to determine it.
*/
busy = 0;
out:
spin_unlock_irqrestore(&m->lock, flags);
return busy;
}
/*-----------------------------------------------------------------
* Module setup
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
.version = {1, 7, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
.map_rq = multipath_map,
.rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.postsuspend = multipath_postsuspend,
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
.ioctl = multipath_ioctl,
.iterate_devices = multipath_iterate_devices,
dm mpath: change to be request based This patch converts dm-multipath target to request-based from bio-based. Basically, the patch just converts the I/O unit from struct bio to struct request. In the course of the conversion, it also changes the I/O queueing mechanism. The change in the I/O queueing is described in details as follows. I/O queueing mechanism change ----------------------------- In I/O submission, map_io(), there is no mechanism change from bio-based, since the clone request is ready for retry as it is. However, in I/O complition, do_end_io(), there is a mechanism change from bio-based, since the clone request is not ready for retry. In do_end_io() of bio-based, the clone bio has all needed memory for resubmission. So the target driver can queue it and resubmit it later without memory allocations. The mechanism has almost no overhead. On the other hand, in do_end_io() of request-based, the clone request doesn't have clone bios, so the target driver can't resubmit it as it is. To resubmit the clone request, memory allocation for clone bios is needed, and it takes some overheads. To avoid the overheads just for queueing, the target driver doesn't queue the clone request inside itself. Instead, the target driver asks dm core for queueing and remapping the original request of the clone request, since the overhead for queueing is just a freeing memory for the clone request. As a result, the target driver doesn't need to record/restore the information of the original request for resubmitting the clone request. So dm_bio_details in dm_mpath_io is removed. multipath_busy() --------------------- The target driver returns "busy", only when the following case: o The target driver will map I/Os, if map() function is called and o The mapped I/Os will wait on underlying device's queue due to their congestions, if map() function is called now. In other cases, the target driver doesn't return "busy". Otherwise, dm core will keep the I/Os and the target driver can't do what it wants. (e.g. the target driver can't map I/Os now, so wants to kill I/Os.) Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Acked-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:37 +00:00
.busy = multipath_busy,
};
static int __init dm_multipath_init(void)
{
int r;
/* allocate a slab for the dm_ios */
_mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
if (!_mpio_cache)
return -ENOMEM;
r = dm_register_target(&multipath_target);
if (r < 0) {
DMERR("register failed %d", r);
kmem_cache_destroy(_mpio_cache);
return -EINVAL;
}
kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
if (!kmultipathd) {
DMERR("failed to create workqueue kmpathd");
dm_unregister_target(&multipath_target);
kmem_cache_destroy(_mpio_cache);
return -ENOMEM;
}
/*
* A separate workqueue is used to handle the device handlers
* to avoid overloading existing workqueue. Overloading the
* old workqueue would also create a bottleneck in the
* path of the storage hardware device activation.
*/
kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
WQ_MEM_RECLAIM);
if (!kmpath_handlerd) {
DMERR("failed to create workqueue kmpath_handlerd");
destroy_workqueue(kmultipathd);
dm_unregister_target(&multipath_target);
kmem_cache_destroy(_mpio_cache);
return -ENOMEM;
}
DMINFO("version %u.%u.%u loaded",
multipath_target.version[0], multipath_target.version[1],
multipath_target.version[2]);
return r;
}
static void __exit dm_multipath_exit(void)
{
destroy_workqueue(kmpath_handlerd);
destroy_workqueue(kmultipathd);
dm_unregister_target(&multipath_target);
kmem_cache_destroy(_mpio_cache);
}
module_init(dm_multipath_init);
module_exit(dm_multipath_exit);
MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");