linux/drivers/md/dm.h

239 lines
7.0 KiB
C
Raw Normal View History

/*
* Internal header file for device mapper
*
* Copyright (C) 2001, 2002 Sistina Software
* Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
*
* This file is released under the LGPL.
*/
#ifndef DM_INTERNAL_H
#define DM_INTERNAL_H
#include <linux/fs.h>
#include <linux/device-mapper.h>
#include <linux/list.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/completion.h>
#include <linux/kobject.h>
#include "dm-stats.h"
/*
* Suspend feature flags
*/
#define DM_SUSPEND_LOCKFS_FLAG (1 << 0)
#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
/*
* Status feature flags
*/
#define DM_STATUS_NOFLUSH_FLAG (1 << 0)
/*
* Type of table and mapped_device's mempool
*/
2014-12-18 02:08:12 +00:00
#define DM_TYPE_NONE 0
#define DM_TYPE_BIO_BASED 1
#define DM_TYPE_REQUEST_BASED 2
#define DM_TYPE_MQ_REQUEST_BASED 3
/*
* List of devices that a metadevice uses and should open/close.
*/
struct dm_dev_internal {
struct list_head list;
atomic_t count;
dm: allow active and inactive tables to share dm_devs Until this change, when loading a new DM table, DM core would re-open all of the devices in the DM table. Now, DM core will avoid redundant device opens (and closes when destroying the old table) if the old table already has a device open using the same mode. This is achieved by managing reference counts on the table_devices that DM core now stores in the mapped_device structure (rather than in the dm_table structure). So a mapped_device's active and inactive dm_tables' dm_dev lists now just point to the dm_devs stored in the mapped_device's table_devices list. This improvement in DM core's device reference counting has the side-effect of fixing a long-standing limitation of the multipath target: a DM multipath table couldn't include any paths that were unusable (failed). For example: if all paths have failed and you add a new, working, path to the table; you can't use it since the table load would fail due to it still containing failed paths. Now a re-load of a multipath table can include failed devices and when those devices become active again they can be used instantly. The device list code in dm.c isn't a straight copy/paste from the code in dm-table.c, but it's very close (aside from some variable renames). One subtle difference is that find_table_device for the tables_devices list will only match devices with the same name and mode. This is because we don't want to upgrade a device's mode in the active table when an inactive table is loaded. Access to the mapped_device structure's tables_devices list requires a mutex (tables_devices_lock), so that tables cannot be created and destroyed concurrently. Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2014-08-13 18:53:43 +00:00
struct dm_dev *dm_dev;
};
struct dm_table;
struct dm_md_mempools;
/*-----------------------------------------------------------------
* Internal table functions.
*---------------------------------------------------------------*/
dm table: rework reference counting Rework table reference counting. The existing code uses a reference counter. When the last reference is dropped and the counter reaches zero, the table destructor is called. Table reference counters are acquired/released from upcalls from other kernel code (dm_any_congested, dm_merge_bvec, dm_unplug_all). If the reference counter reaches zero in one of the upcalls, the table destructor is called from almost random kernel code. This leads to various problems: * dm_any_congested being called under a spinlock, which calls the destructor, which calls some sleeping function. * the destructor attempting to take a lock that is already taken by the same process. * stale reference from some other kernel code keeps the table constructed, which keeps some devices open, even after successful return from "dmsetup remove". This can confuse lvm and prevent closing of underlying devices or reusing device minor numbers. The patch changes reference counting so that the table destructor can be called only at predetermined places. The table has always exactly one reference from either mapped_device->map or hash_cell->new_map. After this patch, this reference is not counted in table->holders. A pair of dm_create_table/dm_destroy_table functions is used for table creation/destruction. Temporary references from the other code increase table->holders. A pair of dm_table_get/dm_table_put functions is used to manipulate it. When the table is about to be destroyed, we wait for table->holders to reach 0. Then, we call the table destructor. We use active waiting with msleep(1), because the situation happens rarely (to one user in 5 years) and removing the device isn't performance-critical task: the user doesn't care if it takes one tick more or not. This way, the destructor is called only at specific points (dm_table_destroy function) and the above problems associated with lazy destruction can't happen. Finally remove the temporary protection added to dm_any_congested(). Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-01-06 03:05:10 +00:00
void dm_table_destroy(struct dm_table *t);
void dm_table_event_callback(struct dm_table *t,
void (*fn)(void *), void *context);
struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
bool dm_table_has_no_data_devices(struct dm_table *table);
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits);
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits);
struct list_head *dm_table_get_devices(struct dm_table *t);
void dm_table_presuspend_targets(struct dm_table *t);
void dm_table_presuspend_undo_targets(struct dm_table *t);
void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
dm: prepare for request based option This patch adds core functions for request-based dm. When struct mapped device (md) is initialized, md->queue has an I/O scheduler and the following functions are used for request-based dm as the queue functions: make_request_fn: dm_make_request() pref_fn: dm_prep_fn() request_fn: dm_request_fn() softirq_done_fn: dm_softirq_done() lld_busy_fn: dm_lld_busy() Actual initializations are done in another patch (PATCH 2). Below is a brief summary of how request-based dm behaves, including: - making request from bio - cloning, mapping and dispatching request - completing request and bio - suspending md - resuming md bio to request ============== md->queue->make_request_fn() (dm_make_request()) calls __make_request() for a bio submitted to the md. Then, the bio is kept in the queue as a new request or merged into another request in the queue if possible. Cloning and Mapping =================== Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()), when requests are dispatched after they are sorted by the I/O scheduler. dm_request_fn() checks busy state of underlying devices using target's busy() function and stops dispatching requests to keep them on the dm device's queue if busy. It helps better I/O merging, since no merge is done for a request once it is dispatched to underlying devices. Actual cloning and mapping are done in dm_prep_fn() and map_request() called from dm_request_fn(). dm_prep_fn() clones not only request but also bios of the request so that dm can hold bio completion in error cases and prevent the bio submitter from noticing the error. (See the "Completion" section below for details.) After the cloning, the clone is mapped by target's map_rq() function and inserted to underlying device's queue using blk_insert_cloned_request(). Completion ========== Request completion can be hooked by rq->end_io(), but then, all bios in the request will have been completed even error cases, and the bio submitter will have noticed the error. To prevent the bio completion in error cases, request-based dm clones both bio and request and hooks both bio->bi_end_io() and rq->end_io(): bio->bi_end_io(): end_clone_bio() rq->end_io(): end_clone_request() Summary of the request completion flow is below: blk_end_request() for a clone request => blk_update_request() => bio->bi_end_io() == end_clone_bio() for each clone bio => Free the clone bio => Success: Complete the original bio (blk_update_request()) Error: Don't complete the original bio => blk_finish_request() => rq->end_io() == end_clone_request() => blk_complete_request() => dm_softirq_done() => Free the clone request => Success: Complete the original request (blk_end_request()) Error: Requeue the original request end_clone_bio() completes the original request on the size of the original bio in successful cases. Even if all bios in the original request are completed by that completion, the original request must not be completed yet to keep the ordering of request completion for the stacking. So end_clone_bio() uses blk_update_request() instead of blk_end_request(). In error cases, end_clone_bio() doesn't complete the original bio. It just frees the cloned bio and gives over the error handling to end_clone_request(). end_clone_request(), which is called with queue lock held, completes the clone request and the original request in a softirq context (dm_softirq_done()), which has no queue lock, to avoid a deadlock issue on submission of another request during the completion: - The submitted request may be mapped to the same device - Request submission requires queue lock, but the queue lock has been held by itself and it doesn't know that The clone request has no clone bio when dm_softirq_done() is called. So target drivers can't resubmit it again even error cases. Instead, they can ask dm core for requeueing and remapping the original request in that cases. suspend ======= Request-based dm uses stopping md->queue as suspend of the md. For noflush suspend, just stops md->queue. For flush suspend, inserts a marker request to the tail of md->queue. And dispatches all requests in md->queue until the marker comes to the front of md->queue. Then, stops dispatching request and waits for the all dispatched requests to complete. After that, completes the marker request, stops md->queue and wake up the waiter on the suspend queue, md->wait. resume ====== Starts md->queue. Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2009-06-22 09:12:35 +00:00
int dm_table_any_busy_target(struct dm_table *t);
unsigned dm_table_get_type(struct dm_table *t);
struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
2014-12-18 02:08:12 +00:00
bool dm_table_mq_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
int dm_queue_merge_is_compulsory(struct request_queue *q);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
void dm_set_md_type(struct mapped_device *md, unsigned type);
unsigned dm_get_md_type(struct mapped_device *md);
struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
dm: do not initialise full request queue when bio based Change bio-based mapped devices no longer to have a fully initialized request_queue (request_fn, elevator, etc). This means bio-based DM devices no longer register elevator sysfs attributes ('iosched/' tree or 'scheduler' other than "none"). In contrast, a request-based DM device will continue to have a full request_queue and will register elevator sysfs attributes. Therefore a user can determine a DM device's type by checking if elevator sysfs attributes exist. First allocate a minimalist request_queue structure for a DM device (needed for both bio and request-based DM). Initialization of a full request_queue is deferred until it is known that the DM device is request-based, at the end of the table load sequence. Factor DM device's request_queue initialization: - common to both request-based and bio-based into dm_init_md_queue(). - specific to request-based into dm_init_request_based_queue(). The md->type_lock mutex is used to protect md->queue, in addition to md->type, during table_load(). A DM device's first table_load will establish the immutable md->type. But md->queue initialization, based on md->type, may fail at that time (because blk_init_allocated_queue cannot allocate memory). Therefore any subsequent table_load must (re)try dm_setup_md_queue independently of establishing md->type. Signed-off-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 03:14:02 +00:00
int dm_setup_md_queue(struct mapped_device *md);
/*
* To check the return value from dm_table_find_target().
*/
#define dm_target_is_valid(t) ((t)->table)
/*
* To check whether the target type is bio-based or not (request-based).
*/
#define dm_target_bio_based(t) ((t)->type->map != NULL)
/*
* To check whether the target type is request-based or not (bio-based).
*/
#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \
((t)->type->clone_and_map_rq != NULL))
/*
* To check whether the target type is a hybrid (capable of being
* either request-based or bio-based).
*/
#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t))
/*-----------------------------------------------------------------
* A registry of target types.
*---------------------------------------------------------------*/
int dm_target_init(void);
void dm_target_exit(void);
struct target_type *dm_get_target_type(const char *name);
void dm_put_target_type(struct target_type *tt);
int dm_target_iterate(void (*iter_func)(struct target_type *tt,
void *param), void *param);
int dm_split_args(int *argc, char ***argvp, char *input);
/*
* Is this mapped_device being deleted?
*/
int dm_deleting_md(struct mapped_device *md);
/*
* Is this mapped_device suspended?
*/
int dm_suspended_md(struct mapped_device *md);
dm: enhance internal suspend and resume interface Rename dm_internal_{suspend,resume} to dm_internal_{suspend,resume}_fast -- dm-stats will continue using these methods to avoid all the extra suspend/resume logic that is not needed in order to quickly flush IO. Introduce dm_internal_suspend_noflush() variant that actually calls the mapped_device's target callbacks -- otherwise target-specific hooks are avoided (e.g. dm-thin's thin_presuspend and thin_postsuspend). Common code between dm_internal_{suspend_noflush,resume} and dm_{suspend,resume} was factored out as __dm_{suspend,resume}. Update dm_internal_{suspend_noflush,resume} to always take and release the mapped_device's suspend_lock. Also update dm_{suspend,resume} to be aware of potential for DM_INTERNAL_SUSPEND_FLAG to be set and respond accordingly by interruptibly waiting for the DM_INTERNAL_SUSPEND_FLAG to be cleared. Add lockdep annotation to dm_suspend() and dm_resume(). The existing DM_SUSPEND_FLAG remains unchanged. DM_INTERNAL_SUSPEND_FLAG is set by dm_internal_suspend_noflush() and cleared by dm_internal_resume(). Both DM_SUSPEND_FLAG and DM_INTERNAL_SUSPEND_FLAG may be set if a device was already suspended when dm_internal_suspend_noflush() was called -- this can be thought of as a "nested suspend". A "nested suspend" can occur with legacy userspace dm-thin code that might suspend all active thin volumes before suspending the pool for resize. But otherwise, in the normal dm-thin-pool suspend case moving forward: the thin-pool will have DM_SUSPEND_FLAG set and all active thins from that thin-pool will have DM_INTERNAL_SUSPEND_FLAG set. Also add DM_INTERNAL_SUSPEND_FLAG to status report. This new DM_INTERNAL_SUSPEND_FLAG state is being reported to assist with debugging (e.g. 'dmsetup info' will report an internally suspended device accordingly). Signed-off-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Joe Thornber <ejt@redhat.com>
2014-10-28 22:34:52 +00:00
/*
* Internal suspend and resume methods.
*/
int dm_suspended_internally_md(struct mapped_device *md);
void dm_internal_suspend_fast(struct mapped_device *md);
void dm_internal_resume_fast(struct mapped_device *md);
void dm_internal_suspend_noflush(struct mapped_device *md);
void dm_internal_resume(struct mapped_device *md);
/*
* Test if the device is scheduled for deferred remove.
*/
int dm_test_deferred_remove_flag(struct mapped_device *md);
/*
* Try to remove devices marked for deferred removal.
*/
void dm_deferred_remove(void);
/*
* The device-mapper can be driven through one of two interfaces;
* ioctl or filesystem, depending which patch you have applied.
*/
int dm_interface_init(void);
void dm_interface_exit(void);
/*
* sysfs interface
*/
struct dm_kobject_holder {
struct kobject kobj;
struct completion completion;
};
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
}
int dm_sysfs_init(struct mapped_device *md);
void dm_sysfs_exit(struct mapped_device *md);
struct kobject *dm_kobject(struct mapped_device *md);
struct mapped_device *dm_get_from_kobject(struct kobject *kobj);
/*
* The kobject helper
*/
void dm_kobject_release(struct kobject *kobj);
/*
* Targets for linear and striped mappings
*/
int dm_linear_init(void);
void dm_linear_exit(void);
int dm_stripe_init(void);
void dm_stripe_exit(void);
dm: separate device deletion from dm_put This patch separates the device deletion code from dm_put() to make sure the deletion happens in the process context. By this patch, device deletion always occurs in an ioctl (process) context and dm_put() can be called in interrupt context. As a result, the request-based dm's bad dm_put() usage pointed out by Mikulas below disappears. http://marc.info/?l=dm-devel&m=126699981019735&w=2 Without this patch, I confirmed there is a case to crash the system: dm_put() => dm_table_destroy() => vfree() => BUG_ON(in_interrupt()) Some more backgrounds and details: In request-based dm, a device opener can remove a mapped_device while the last request is still completing, because bios in the last request complete first and then the device opener can close and remove the mapped_device before the last request completes: CPU0 CPU1 ================================================================= <<INTERRUPT>> blk_end_request_all(clone_rq) blk_update_request(clone_rq) bio_endio(clone_bio) == end_clone_bio blk_update_request(orig_rq) bio_endio(orig_bio) <<I/O completed>> dm_blk_close() dev_remove() dm_put(md) <<Free md>> blk_finish_request(clone_rq) .... dm_end_request(clone_rq) free_rq_clone(clone_rq) blk_end_request_all(orig_rq) rq_completed(md) So request-based dm used dm_get()/dm_put() to hold md for each I/O until its request completion handling is fully done. However, the final dm_put() can call the device deletion code which must not be run in interrupt context and may cause kernel panic. To solve the problem, this patch moves the device deletion code, dm_destroy(), to predetermined places that is actually deleting the mapped_device in ioctl (process) context, and changes dm_put() just to decrement the reference count of the mapped_device. By this change, dm_put() can be used in any context and the symmetric model below is introduced: dm_create(): create a mapped_device dm_destroy(): destroy a mapped_device dm_get(): increment the reference count of a mapped_device dm_put(): decrement the reference count of a mapped_device dm_destroy() waits for all references of the mapped_device to disappear, then deletes the mapped_device. dm_destroy() uses active waiting with msleep(1), since deleting the mapped_device isn't performance-critical task. And since at this point, nobody opens the mapped_device and no new reference will be taken, the pending counts are just for racing completing activity and will eventually decrease to zero. For the unlikely case of the forced module unload, dm_destroy_immediate(), which doesn't wait and forcibly deletes the mapped_device, is also introduced and used in dm_hash_remove_all(). Otherwise, "rmmod -f" may be stuck and never return. And now, because the mapped_device is deleted at this point, subsequent accesses to the mapped_device may cause NULL pointer references. Cc: stable@kernel.org Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com> Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
2010-08-12 03:13:56 +00:00
/*
* mapped_device operations
*/
void dm_destroy(struct mapped_device *md);
void dm_destroy_immediate(struct mapped_device *md);
int dm_open_count(struct mapped_device *md);
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
int dm_cancel_deferred_remove(struct mapped_device *md);
int dm_request_based(struct mapped_device *md);
sector_t dm_get_size(struct mapped_device *md);
struct request_queue *dm_get_md_queue(struct mapped_device *md);
dm: allow active and inactive tables to share dm_devs Until this change, when loading a new DM table, DM core would re-open all of the devices in the DM table. Now, DM core will avoid redundant device opens (and closes when destroying the old table) if the old table already has a device open using the same mode. This is achieved by managing reference counts on the table_devices that DM core now stores in the mapped_device structure (rather than in the dm_table structure). So a mapped_device's active and inactive dm_tables' dm_dev lists now just point to the dm_devs stored in the mapped_device's table_devices list. This improvement in DM core's device reference counting has the side-effect of fixing a long-standing limitation of the multipath target: a DM multipath table couldn't include any paths that were unusable (failed). For example: if all paths have failed and you add a new, working, path to the table; you can't use it since the table load would fail due to it still containing failed paths. Now a re-load of a multipath table can include failed devices and when those devices become active again they can be used instantly. The device list code in dm.c isn't a straight copy/paste from the code in dm-table.c, but it's very close (aside from some variable renames). One subtle difference is that find_table_device for the tables_devices list will only match devices with the same name and mode. This is because we don't want to upgrade a device's mode in the active table when an inactive table is loaded. Access to the mapped_device structure's tables_devices list requires a mutex (tables_devices_lock), so that tables cannot be created and destroyed concurrently. Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2014-08-13 18:53:43 +00:00
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
struct dm_dev **result);
void dm_put_table_device(struct mapped_device *md, struct dm_dev *d);
struct dm_stats *dm_get_stats(struct mapped_device *md);
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
unsigned cookie);
void dm_internal_suspend(struct mapped_device *md);
void dm_internal_resume(struct mapped_device *md);
int dm_io_init(void);
void dm_io_exit(void);
int dm_kcopyd_init(void);
void dm_kcopyd_exit(void);
/*
* Mempool operations
*/
struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
void dm_free_md_mempools(struct dm_md_mempools *pools);
/*
* Helpers that are used by DM core
*/
unsigned dm_get_reserved_bio_based_ios(void);
unsigned dm_get_reserved_rq_based_ios(void);
static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
{
return !maxlen || strlen(result) + 1 >= maxlen;
}
#endif