linux/drivers/md/dm-switch.c

588 lines
15 KiB
C
Raw Normal View History

/*
* Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
* Copyright (C) 2011-2013 Red Hat, Inc.
*
* This file is released under the GPL.
*
* dm-switch is a device-mapper target that maps IO to underlying block
* devices efficiently when there are a large number of fixed-sized
* address regions but there is no simple pattern to allow for a compact
* mapping representation such as dm-stripe.
*/
#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#define DM_MSG_PREFIX "switch"
/*
* One region_table_slot_t holds <region_entries_per_slot> region table
* entries each of which is <region_table_entry_bits> in size.
*/
typedef unsigned long region_table_slot_t;
/*
* A device with the offset to its start sector.
*/
struct switch_path {
struct dm_dev *dmdev;
sector_t start;
};
/*
* Context block for a dm switch device.
*/
struct switch_ctx {
struct dm_target *ti;
unsigned nr_paths; /* Number of paths in path_list. */
unsigned region_size; /* Region size in 512-byte sectors */
unsigned long nr_regions; /* Number of regions making up the device */
signed char region_size_bits; /* log2 of region_size or -1 */
unsigned char region_table_entry_bits; /* Number of bits in one region table entry */
unsigned char region_entries_per_slot; /* Number of entries in one region table slot */
signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */
region_table_slot_t *region_table; /* Region table */
/*
* Array of dm devices to switch between.
*/
dm: replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2020-05-07 18:51:58 +00:00
struct switch_path path_list[];
};
static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
unsigned region_size)
{
struct switch_ctx *sctx;
sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL);
if (!sctx)
return NULL;
sctx->ti = ti;
sctx->region_size = region_size;
ti->private = sctx;
return sctx;
}
static int alloc_region_table(struct dm_target *ti, unsigned nr_paths)
{
struct switch_ctx *sctx = ti->private;
sector_t nr_regions = ti->len;
sector_t nr_slots;
if (!(sctx->region_size & (sctx->region_size - 1)))
sctx->region_size_bits = __ffs(sctx->region_size);
else
sctx->region_size_bits = -1;
sctx->region_table_entry_bits = 1;
while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
(region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
sctx->region_table_entry_bits++;
sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
else
sctx->region_entries_per_slot_bits = -1;
if (sector_div(nr_regions, sctx->region_size))
nr_regions++;
if (nr_regions >= ULONG_MAX) {
ti->error = "Region table too large";
return -EINVAL;
}
sctx->nr_regions = nr_regions;
nr_slots = nr_regions;
if (sector_div(nr_slots, sctx->region_entries_per_slot))
nr_slots++;
if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
ti->error = "Region table too large";
return -EINVAL;
}
treewide: Use array_size() in vmalloc() The vmalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: vmalloc(a * b) with: vmalloc(array_size(a, b)) as well as handling cases of: vmalloc(a * b * c) with: vmalloc(array3_size(a, b, c)) This does, however, attempt to ignore constant size factors like: vmalloc(4 * 1024) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( vmalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | vmalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( vmalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | vmalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | vmalloc( - sizeof(char) * (COUNT) + COUNT , ...) | vmalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | vmalloc( - sizeof(u8) * COUNT + COUNT , ...) | vmalloc( - sizeof(__u8) * COUNT + COUNT , ...) | vmalloc( - sizeof(char) * COUNT + COUNT , ...) | vmalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( vmalloc( - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) | vmalloc( - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ vmalloc( - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( vmalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vmalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | vmalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | vmalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( vmalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | vmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | vmalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | vmalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | vmalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | vmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( vmalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | vmalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( vmalloc(C1 * C2 * C3, ...) | vmalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression E1, E2; constant C1, C2; @@ ( vmalloc(C1 * C2, ...) | vmalloc( - E1 * E2 + array_size(E1, E2) , ...) ) Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-12 21:27:11 +00:00
sctx->region_table = vmalloc(array_size(nr_slots,
sizeof(region_table_slot_t)));
if (!sctx->region_table) {
ti->error = "Cannot allocate region table";
return -ENOMEM;
}
return 0;
}
static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
unsigned long *region_index, unsigned *bit)
{
if (sctx->region_entries_per_slot_bits >= 0) {
*region_index = region_nr >> sctx->region_entries_per_slot_bits;
*bit = region_nr & (sctx->region_entries_per_slot - 1);
} else {
*region_index = region_nr / sctx->region_entries_per_slot;
*bit = region_nr % sctx->region_entries_per_slot;
}
*bit *= sctx->region_table_entry_bits;
}
static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
{
unsigned long region_index;
unsigned bit;
switch_get_position(sctx, region_nr, &region_index, &bit);
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-23 21:07:29 +00:00
return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
((1 << sctx->region_table_entry_bits) - 1);
}
/*
* Find which path to use at given offset.
*/
static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
{
unsigned path_nr;
sector_t p;
p = offset;
if (sctx->region_size_bits >= 0)
p >>= sctx->region_size_bits;
else
sector_div(p, sctx->region_size);
path_nr = switch_region_table_read(sctx, p);
/* This can only happen if the processor uses non-atomic stores. */
if (unlikely(path_nr >= sctx->nr_paths))
path_nr = 0;
return path_nr;
}
static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
unsigned value)
{
unsigned long region_index;
unsigned bit;
region_table_slot_t pte;
switch_get_position(sctx, region_nr, &region_index, &bit);
pte = sctx->region_table[region_index];
pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
pte |= (region_table_slot_t)value << bit;
sctx->region_table[region_index] = pte;
}
/*
* Fill the region table with an initial round robin pattern.
*/
static void initialise_region_table(struct switch_ctx *sctx)
{
unsigned path_nr = 0;
unsigned long region_nr;
for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
switch_region_table_write(sctx, region_nr, path_nr);
if (++path_nr >= sctx->nr_paths)
path_nr = 0;
}
}
static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
{
struct switch_ctx *sctx = ti->private;
unsigned long long start;
int r;
r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
&sctx->path_list[sctx->nr_paths].dmdev);
if (r) {
ti->error = "Device lookup failed";
return r;
}
if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
ti->error = "Invalid device starting offset";
dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
return -EINVAL;
}
sctx->path_list[sctx->nr_paths].start = start;
sctx->nr_paths++;
return 0;
}
/*
* Destructor: Don't free the dm_target, just the ti->private data (if any).
*/
static void switch_dtr(struct dm_target *ti)
{
struct switch_ctx *sctx = ti->private;
while (sctx->nr_paths--)
dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
vfree(sctx->region_table);
kfree(sctx);
}
/*
* Constructor arguments:
* <num_paths> <region_size> <num_optional_args> [<optional_args>...]
* [<dev_path> <offset>]+
*
* Optional args are to allow for future extension: currently this
* parameter must be 0.
*/
static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
static const struct dm_arg _args[] = {
{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
{1, UINT_MAX, "Invalid region size"},
{0, 0, "Invalid number of optional args"},
};
struct switch_ctx *sctx;
struct dm_arg_set as;
unsigned nr_paths, region_size, nr_optional_args;
int r;
as.argc = argc;
as.argv = argv;
r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
if (r)
return -EINVAL;
r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
if (r)
return r;
r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
if (r)
return r;
/* parse optional arguments here, if we add any */
if (as.argc != nr_paths * 2) {
ti->error = "Incorrect number of path arguments";
return -EINVAL;
}
sctx = alloc_switch_ctx(ti, nr_paths, region_size);
if (!sctx) {
ti->error = "Cannot allocate redirection context";
return -ENOMEM;
}
r = dm_set_target_max_io_len(ti, region_size);
if (r)
goto error;
while (as.argc) {
r = parse_path(&as, ti);
if (r)
goto error;
}
r = alloc_region_table(ti, nr_paths);
if (r)
goto error;
initialise_region_table(sctx);
/* For UNMAP, sending the request down any path is sufficient */
ti->num_discard_bios = 1;
return 0;
error:
switch_dtr(ti);
return r;
}
static int switch_map(struct dm_target *ti, struct bio *bio)
{
struct switch_ctx *sctx = ti->private;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
unsigned path_nr = switch_get_path_nr(sctx, offset);
bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
return DM_MAPIO_REMAPPED;
}
/*
* We need to parse hex numbers in the message as quickly as possible.
*
* This table-based hex parser improves performance.
* It improves a time to load 1000000 entries compared to the condition-based
* parser.
* table-based parser condition-based parser
* PA-RISC 0.29s 0.31s
* Opteron 0.0495s 0.0498s
*/
static const unsigned char hex_table[256] = {
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};
static __always_inline unsigned long parse_hex(const char **string)
{
unsigned char d;
unsigned long r = 0;
while ((d = hex_table[(unsigned char)**string]) < 16) {
r = (r << 4) | d;
(*string)++;
}
return r;
}
static int process_set_region_mappings(struct switch_ctx *sctx,
unsigned argc, char **argv)
{
unsigned i;
unsigned long region_index = 0;
for (i = 1; i < argc; i++) {
unsigned long path_nr;
const char *string = argv[i];
if ((*string & 0xdf) == 'R') {
unsigned long cycle_length, num_write;
string++;
if (unlikely(*string == ',')) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
cycle_length = parse_hex(&string);
if (unlikely(*string != ',')) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
string++;
if (unlikely(!*string)) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
num_write = parse_hex(&string);
if (unlikely(*string)) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
cycle_length - 1, region_index);
return -EINVAL;
}
if (unlikely(region_index + num_write < region_index) ||
unlikely(region_index + num_write >= sctx->nr_regions)) {
DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
region_index, num_write, sctx->nr_regions);
return -EINVAL;
}
while (num_write--) {
region_index++;
path_nr = switch_region_table_read(sctx, region_index - cycle_length);
switch_region_table_write(sctx, region_index, path_nr);
}
continue;
}
if (*string == ':')
region_index++;
else {
region_index = parse_hex(&string);
if (unlikely(*string != ':')) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
}
string++;
if (unlikely(!*string)) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
path_nr = parse_hex(&string);
if (unlikely(*string)) {
DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
return -EINVAL;
}
if (unlikely(region_index >= sctx->nr_regions)) {
DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
return -EINVAL;
}
if (unlikely(path_nr >= sctx->nr_paths)) {
DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
return -EINVAL;
}
switch_region_table_write(sctx, region_index, path_nr);
}
return 0;
}
/*
* Messages are processed one-at-a-time.
*
* Only set_region_mappings is supported.
*/
static int switch_message(struct dm_target *ti, unsigned argc, char **argv,
char *result, unsigned maxlen)
{
static DEFINE_MUTEX(message_mutex);
struct switch_ctx *sctx = ti->private;
int r = -EINVAL;
mutex_lock(&message_mutex);
if (!strcasecmp(argv[0], "set_region_mappings"))
r = process_set_region_mappings(sctx, argc, argv);
else
DMWARN("Unrecognised message received.");
mutex_unlock(&message_mutex);
return r;
}
static void switch_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
struct switch_ctx *sctx = ti->private;
unsigned sz = 0;
int path_nr;
switch (type) {
case STATUSTYPE_INFO:
result[0] = '\0';
break;
case STATUSTYPE_TABLE:
DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
(unsigned long long)sctx->path_list[path_nr].start);
break;
}
}
/*
* Switch ioctl:
*
* Passthrough all ioctls to the path for sector 0
*/
static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
struct switch_ctx *sctx = ti->private;
unsigned path_nr;
path_nr = switch_get_path_nr(sctx, 0);
*bdev = sctx->path_list[path_nr].dmdev->bdev;
/*
* Only pass ioctls through if the device sizes match exactly.
*/
if (ti->len + sctx->path_list[path_nr].start !=
i_size_read((*bdev)->bd_inode) >> SECTOR_SHIFT)
return 1;
return 0;
}
static int switch_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
struct switch_ctx *sctx = ti->private;
int path_nr;
int r;
for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
r = fn(ti, sctx->path_list[path_nr].dmdev,
sctx->path_list[path_nr].start, ti->len, data);
if (r)
return r;
}
return 0;
}
static struct target_type switch_target = {
.name = "switch",
.version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = switch_ctr,
.dtr = switch_dtr,
.map = switch_map,
.message = switch_message,
.status = switch_status,
.prepare_ioctl = switch_prepare_ioctl,
.iterate_devices = switch_iterate_devices,
};
static int __init dm_switch_init(void)
{
int r;
r = dm_register_target(&switch_target);
if (r < 0)
DMERR("dm_register_target() failed %d", r);
return r;
}
static void __exit dm_switch_exit(void)
{
dm_unregister_target(&switch_target);
}
module_init(dm_switch_init);
module_exit(dm_switch_exit);
MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
MODULE_LICENSE("GPL");