linux/drivers/block/xen-blkback/common.h
Roger Pau Monne ef75341133 xen-blkback: fix memory leaks
I've at least identified two possible memory leaks in blkback, both
related to the shutdown path of a VBD:

- blkback doesn't wait for any pending purge work to finish before
  cleaning the list of free_pages. The purge work will call
  put_free_pages and thus we might end up with pages being added to
  the free_pages list after we have emptied it. Fix this by making
  sure there's no pending purge work before exiting
  xen_blkif_schedule, and moving the free_page cleanup code to
  xen_blkif_free.
- blkback doesn't wait for pending requests to end before cleaning
  persistent grants and the list of free_pages. Again this can add
  pages to the free_pages list or persistent grants to the
  persistent_gnts red-black tree. Fixed by moving the persistent
  grants and free_pages cleanup code to xen_blkif_free.

Also, add some checks in xen_blkif_free to make sure we are cleaning
everything.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Tested-by: Matt Rushton <mrushton@amazon.com>
Reviewed-by: Matt Rushton <mrushton@amazon.com>
Cc: Matt Wilson <msw@amazon.com>
Cc: Ian Campbell <Ian.Campbell@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
2014-02-07 12:58:46 -05:00

485 lines
16 KiB
C

/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
#define __XEN_BLKIF__BACKEND__COMMON_H__
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/io.h>
#include <linux/rbtree.h>
#include <asm/setup.h>
#include <asm/pgalloc.h>
#include <asm/hypervisor.h>
#include <xen/grant_table.h>
#include <xen/xenbus.h>
#include <xen/interface/io/ring.h>
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>
#define DRV_PFX "xen-blkback:"
#define DPRINTK(fmt, args...) \
pr_debug(DRV_PFX "(%s:%d) " fmt ".\n", \
__func__, __LINE__, ##args)
/*
* This is the maximum number of segments that would be allowed in indirect
* requests. This value will also be passed to the frontend.
*/
#define MAX_INDIRECT_SEGMENTS 256
#define SEGS_PER_INDIRECT_FRAME \
(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
#define MAX_INDIRECT_PAGES \
((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
#define INDIRECT_PAGES(_segs) \
((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
/* Not a real protocol. Used to generate ring structs which contain
* the elements common to all protocols only. This way we get a
* compiler-checkable way to use common struct elements, so we can
* avoid using switch(protocol) in a number of places. */
struct blkif_common_request {
char dummy;
};
struct blkif_common_response {
char dummy;
};
struct blkif_x86_32_request_rw {
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
} __attribute__((__packed__));
struct blkif_x86_32_request_discard {
uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
blkif_vdev_t _pad1; /* was "handle" for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
uint64_t nr_sectors;
} __attribute__((__packed__));
struct blkif_x86_32_request_other {
uint8_t _pad1;
blkif_vdev_t _pad2;
uint64_t id; /* private guest value, echoed in resp */
} __attribute__((__packed__));
struct blkif_x86_32_request_indirect {
uint8_t indirect_op;
uint16_t nr_segments;
uint64_t id;
blkif_sector_t sector_number;
blkif_vdev_t handle;
uint16_t _pad1;
grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
/*
* The maximum number of indirect segments (and pages) that will
* be used is determined by MAX_INDIRECT_SEGMENTS, this value
* is also exported to the guest (via xenstore
* feature-max-indirect-segments entry), so the frontend knows how
* many indirect segments the backend supports.
*/
uint64_t _pad2; /* make it 64 byte aligned */
} __attribute__((__packed__));
struct blkif_x86_32_request {
uint8_t operation; /* BLKIF_OP_??? */
union {
struct blkif_x86_32_request_rw rw;
struct blkif_x86_32_request_discard discard;
struct blkif_x86_32_request_other other;
struct blkif_x86_32_request_indirect indirect;
} u;
} __attribute__((__packed__));
/* i386 protocol version */
#pragma pack(push, 4)
struct blkif_x86_32_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
int16_t status; /* BLKIF_RSP_??? */
};
#pragma pack(pop)
/* x86_64 protocol version */
struct blkif_x86_64_request_rw {
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
uint32_t _pad1; /* offsetof(blkif_reqest..,u.rw.id)==8 */
uint64_t id;
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
} __attribute__((__packed__));
struct blkif_x86_64_request_discard {
uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
blkif_vdev_t _pad1; /* was "handle" for read/write requests */
uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */
uint64_t id;
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
uint64_t nr_sectors;
} __attribute__((__packed__));
struct blkif_x86_64_request_other {
uint8_t _pad1;
blkif_vdev_t _pad2;
uint32_t _pad3; /* offsetof(blkif_..,u.discard.id)==8 */
uint64_t id; /* private guest value, echoed in resp */
} __attribute__((__packed__));
struct blkif_x86_64_request_indirect {
uint8_t indirect_op;
uint16_t nr_segments;
uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
uint64_t id;
blkif_sector_t sector_number;
blkif_vdev_t handle;
uint16_t _pad2;
grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
/*
* The maximum number of indirect segments (and pages) that will
* be used is determined by MAX_INDIRECT_SEGMENTS, this value
* is also exported to the guest (via xenstore
* feature-max-indirect-segments entry), so the frontend knows how
* many indirect segments the backend supports.
*/
uint32_t _pad3; /* make it 64 byte aligned */
} __attribute__((__packed__));
struct blkif_x86_64_request {
uint8_t operation; /* BLKIF_OP_??? */
union {
struct blkif_x86_64_request_rw rw;
struct blkif_x86_64_request_discard discard;
struct blkif_x86_64_request_other other;
struct blkif_x86_64_request_indirect indirect;
} u;
} __attribute__((__packed__));
struct blkif_x86_64_response {
uint64_t __attribute__((__aligned__(8))) id;
uint8_t operation; /* copied from request */
int16_t status; /* BLKIF_RSP_??? */
};
DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
struct blkif_common_response);
DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
struct blkif_x86_32_response);
DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
struct blkif_x86_64_response);
union blkif_back_rings {
struct blkif_back_ring native;
struct blkif_common_back_ring common;
struct blkif_x86_32_back_ring x86_32;
struct blkif_x86_64_back_ring x86_64;
};
enum blkif_protocol {
BLKIF_PROTOCOL_NATIVE = 1,
BLKIF_PROTOCOL_X86_32 = 2,
BLKIF_PROTOCOL_X86_64 = 3,
};
struct xen_vbd {
/* What the domain refers to this vbd as. */
blkif_vdev_t handle;
/* Non-zero -> read-only */
unsigned char readonly;
/* VDISK_xxx */
unsigned char type;
/* phys device that this vbd maps to. */
u32 pdevice;
struct block_device *bdev;
/* Cached size parameter. */
sector_t size;
unsigned int flush_support:1;
unsigned int discard_secure:1;
unsigned int feature_gnt_persistent:1;
unsigned int overflow_max_grants:1;
};
struct backend_info;
/* Number of available flags */
#define PERSISTENT_GNT_FLAGS_SIZE 2
/* This persistent grant is currently in use */
#define PERSISTENT_GNT_ACTIVE 0
/*
* This persistent grant has been used, this flag is set when we remove the
* PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
*/
#define PERSISTENT_GNT_WAS_ACTIVE 1
/* Number of requests that we can fit in a ring */
#define XEN_BLKIF_REQS 32
struct persistent_gnt {
struct page *page;
grant_ref_t gnt;
grant_handle_t handle;
DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
struct rb_node node;
struct list_head remove_node;
};
struct xen_blkif {
/* Unique identifier for this interface. */
domid_t domid;
unsigned int handle;
/* Physical parameters of the comms window. */
unsigned int irq;
/* Comms information. */
enum blkif_protocol blk_protocol;
union blkif_back_rings blk_rings;
void *blk_ring;
/* The VBD attached to this interface. */
struct xen_vbd vbd;
/* Back pointer to the backend_info. */
struct backend_info *be;
/* Private fields. */
spinlock_t blk_ring_lock;
atomic_t refcnt;
wait_queue_head_t wq;
/* for barrier (drain) requests */
struct completion drain_complete;
atomic_t drain;
/* One thread per one blkif. */
struct task_struct *xenblkd;
unsigned int waiting_reqs;
/* tree to store persistent grants */
struct rb_root persistent_gnts;
unsigned int persistent_gnt_c;
atomic_t persistent_gnt_in_use;
unsigned long next_lru;
/* used by the kworker that offload work from the persistent purge */
struct list_head persistent_purge_list;
struct work_struct persistent_purge_work;
/* buffer of free pages to map grant refs */
spinlock_t free_pages_lock;
int free_pages_num;
struct list_head free_pages;
/* List of all 'pending_req' available */
struct list_head pending_free;
/* And its spinlock. */
spinlock_t pending_free_lock;
wait_queue_head_t pending_free_wq;
/* statistics */
unsigned long st_print;
unsigned long long st_rd_req;
unsigned long long st_wr_req;
unsigned long long st_oo_req;
unsigned long long st_f_req;
unsigned long long st_ds_req;
unsigned long long st_rd_sect;
unsigned long long st_wr_sect;
wait_queue_head_t waiting_to_free;
/* Thread shutdown wait queue. */
wait_queue_head_t shutdown_wq;
};
struct seg_buf {
unsigned long offset;
unsigned int nsec;
};
struct grant_page {
struct page *page;
struct persistent_gnt *persistent_gnt;
grant_handle_t handle;
grant_ref_t gref;
};
/*
* Each outstanding request that we've passed to the lower device layers has a
* 'pending_req' allocated to it. Each buffer_head that completes decrements
* the pendcnt towards zero. When it hits zero, the specified domain has a
* response queued for it, with the saved 'id' passed back.
*/
struct pending_req {
struct xen_blkif *blkif;
u64 id;
int nr_pages;
atomic_t pendcnt;
unsigned short operation;
int status;
struct list_head free_list;
struct grant_page *segments[MAX_INDIRECT_SEGMENTS];
/* Indirect descriptors */
struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
struct bio *biolist[MAX_INDIRECT_SEGMENTS];
};
#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
(_v)->bdev->bd_part->nr_sects : \
get_capacity((_v)->bdev->bd_disk))
#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
#define xen_blkif_put(_b) \
do { \
if (atomic_dec_and_test(&(_b)->refcnt)) \
wake_up(&(_b)->waiting_to_free);\
} while (0)
struct phys_req {
unsigned short dev;
blkif_sector_t nr_sects;
struct block_device *bdev;
blkif_sector_t sector_number;
};
int xen_blkif_interface_init(void);
int xen_blkif_xenbus_init(void);
irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
int xen_blkif_schedule(void *arg);
int xen_blkif_purge_persistent(void *arg);
void xen_blkbk_free_caches(struct xen_blkif *blkif);
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
struct backend_info *be, int state);
int xen_blkbk_barrier(struct xenbus_transaction xbt,
struct backend_info *be, int state);
struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
static inline void blkif_get_x86_32_req(struct blkif_request *dst,
struct blkif_x86_32_request *src)
{
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
dst->operation = src->operation;
switch (src->operation) {
case BLKIF_OP_READ:
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
case BLKIF_OP_FLUSH_DISKCACHE:
dst->u.rw.nr_segments = src->u.rw.nr_segments;
dst->u.rw.handle = src->u.rw.handle;
dst->u.rw.id = src->u.rw.id;
dst->u.rw.sector_number = src->u.rw.sector_number;
barrier();
if (n > dst->u.rw.nr_segments)
n = dst->u.rw.nr_segments;
for (i = 0; i < n; i++)
dst->u.rw.seg[i] = src->u.rw.seg[i];
break;
case BLKIF_OP_DISCARD:
dst->u.discard.flag = src->u.discard.flag;
dst->u.discard.id = src->u.discard.id;
dst->u.discard.sector_number = src->u.discard.sector_number;
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
break;
case BLKIF_OP_INDIRECT:
dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
dst->u.indirect.handle = src->u.indirect.handle;
dst->u.indirect.id = src->u.indirect.id;
dst->u.indirect.sector_number = src->u.indirect.sector_number;
barrier();
j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
for (i = 0; i < j; i++)
dst->u.indirect.indirect_grefs[i] =
src->u.indirect.indirect_grefs[i];
break;
default:
/*
* Don't know how to translate this op. Only get the
* ID so failure can be reported to the frontend.
*/
dst->u.other.id = src->u.other.id;
break;
}
}
static inline void blkif_get_x86_64_req(struct blkif_request *dst,
struct blkif_x86_64_request *src)
{
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
dst->operation = src->operation;
switch (src->operation) {
case BLKIF_OP_READ:
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
case BLKIF_OP_FLUSH_DISKCACHE:
dst->u.rw.nr_segments = src->u.rw.nr_segments;
dst->u.rw.handle = src->u.rw.handle;
dst->u.rw.id = src->u.rw.id;
dst->u.rw.sector_number = src->u.rw.sector_number;
barrier();
if (n > dst->u.rw.nr_segments)
n = dst->u.rw.nr_segments;
for (i = 0; i < n; i++)
dst->u.rw.seg[i] = src->u.rw.seg[i];
break;
case BLKIF_OP_DISCARD:
dst->u.discard.flag = src->u.discard.flag;
dst->u.discard.id = src->u.discard.id;
dst->u.discard.sector_number = src->u.discard.sector_number;
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
break;
case BLKIF_OP_INDIRECT:
dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
dst->u.indirect.handle = src->u.indirect.handle;
dst->u.indirect.id = src->u.indirect.id;
dst->u.indirect.sector_number = src->u.indirect.sector_number;
barrier();
j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
for (i = 0; i < j; i++)
dst->u.indirect.indirect_grefs[i] =
src->u.indirect.indirect_grefs[i];
break;
default:
/*
* Don't know how to translate this op. Only get the
* ID so failure can be reported to the frontend.
*/
dst->u.other.id = src->u.other.id;
break;
}
}
#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */