mirror of
https://github.com/torvalds/linux.git
synced 2024-11-22 12:11:40 +00:00
Merge branch 'for-3.11/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver bits from Jens Axboe: "As I mentioned in the core block pull request, due to real life circumstances the driver pull request would be late. Now it looks like -rc2 late... On the plus side, apart from the rsxx update, these are all things that I could argue could go in later in the cycle as they are fixes and not features. So even though things are late, it's not ALL bad. The pull request contains: - Updates to bcache, all bug fixes, from Kent. - A pile of drbd bug fixes (no big features this time!). - xen blk front/back fixes. - rsxx driver updates, some of them deferred from 3.10. So should be well cooked by now" * 'for-3.11/drivers' of git://git.kernel.dk/linux-block: (63 commits) bcache: Allocation kthread fixes bcache: Fix GC_SECTORS_USED() calculation bcache: Journal replay fix bcache: Shutdown fix bcache: Fix a sysfs splat on shutdown bcache: Advertise that flushes are supported bcache: check for allocation failures bcache: Fix a dumb race bcache: Use standard utility code bcache: Update email address bcache: Delete fuzz tester bcache: Document shrinker reserve better bcache: FUA fixes drbd: Allow online change of al-stripes and al-stripe-size drbd: Constants should be UPPERCASE drbd: Ignore the exit code of a fence-peer handler if it returns too late drbd: Fix rcu_read_lock balance on error path drbd: fix error return code in drbd_init() drbd: Do not sleep inside rcu bcache: Refresh usage docs ...
This commit is contained in:
commit
d4c90b1b9f
17
Documentation/ABI/testing/sysfs-driver-xen-blkback
Normal file
17
Documentation/ABI/testing/sysfs-driver-xen-blkback
Normal file
@ -0,0 +1,17 @@
|
||||
What: /sys/module/xen_blkback/parameters/max_buffer_pages
|
||||
Date: March 2013
|
||||
KernelVersion: 3.11
|
||||
Contact: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Description:
|
||||
Maximum number of free pages to keep in each block
|
||||
backend buffer.
|
||||
|
||||
What: /sys/module/xen_blkback/parameters/max_persistent_grants
|
||||
Date: March 2013
|
||||
KernelVersion: 3.11
|
||||
Contact: Roger Pau Monné <roger.pau@citrix.com>
|
||||
Description:
|
||||
Maximum number of grants to map persistently in
|
||||
blkback. If the frontend tries to use more than
|
||||
max_persistent_grants, the LRU kicks in and starts
|
||||
removing 5% of max_persistent_grants every 100ms.
|
10
Documentation/ABI/testing/sysfs-driver-xen-blkfront
Normal file
10
Documentation/ABI/testing/sysfs-driver-xen-blkfront
Normal file
@ -0,0 +1,10 @@
|
||||
What: /sys/module/xen_blkfront/parameters/max
|
||||
Date: June 2013
|
||||
KernelVersion: 3.11
|
||||
Contact: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
|
||||
Description:
|
||||
Maximum number of segments that the frontend will negotiate
|
||||
with the backend for indirect descriptors. The default value
|
||||
is 32 - higher value means more potential throughput but more
|
||||
memory usage. The backend picks the minimum of the frontend
|
||||
and its default backend value.
|
@ -46,29 +46,33 @@ you format your backing devices and cache device at the same time, you won't
|
||||
have to manually attach:
|
||||
make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
|
||||
|
||||
To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:
|
||||
bcache-tools now ships udev rules, and bcache devices are known to the kernel
|
||||
immediately. Without udev, you can manually register devices like this:
|
||||
|
||||
echo /dev/sdb > /sys/fs/bcache/register
|
||||
echo /dev/sdc > /sys/fs/bcache/register
|
||||
|
||||
To register your bcache devices automatically, you could add something like
|
||||
this to an init script:
|
||||
Registering the backing device makes the bcache device show up in /dev; you can
|
||||
now format it and use it as normal. But the first time using a new bcache
|
||||
device, it'll be running in passthrough mode until you attach it to a cache.
|
||||
See the section on attaching.
|
||||
|
||||
echo /dev/sd* > /sys/fs/bcache/register_quiet
|
||||
The devices show up as:
|
||||
|
||||
It'll look for bcache superblocks and ignore everything that doesn't have one.
|
||||
/dev/bcache<N>
|
||||
|
||||
Registering the backing device makes the bcache show up in /dev; you can now
|
||||
format it and use it as normal. But the first time using a new bcache device,
|
||||
it'll be running in passthrough mode until you attach it to a cache. See the
|
||||
section on attaching.
|
||||
As well as (with udev):
|
||||
|
||||
The devices show up at /dev/bcacheN, and can be controlled via sysfs from
|
||||
/sys/block/bcacheN/bcache:
|
||||
/dev/bcache/by-uuid/<uuid>
|
||||
/dev/bcache/by-label/<label>
|
||||
|
||||
To get started:
|
||||
|
||||
mkfs.ext4 /dev/bcache0
|
||||
mount /dev/bcache0 /mnt
|
||||
|
||||
You can control bcache devices through sysfs at /sys/block/bcache<N>/bcache .
|
||||
|
||||
Cache devices are managed as sets; multiple caches per set isn't supported yet
|
||||
but will allow for mirroring of metadata and dirty data in the future. Your new
|
||||
cache set shows up as /sys/fs/bcache/<UUID>
|
||||
@ -80,11 +84,11 @@ must be attached to your cache set to enable caching. Attaching a backing
|
||||
device to a cache set is done thusly, with the UUID of the cache set in
|
||||
/sys/fs/bcache:
|
||||
|
||||
echo <UUID> > /sys/block/bcache0/bcache/attach
|
||||
echo <CSET-UUID> > /sys/block/bcache0/bcache/attach
|
||||
|
||||
This only has to be done once. The next time you reboot, just reregister all
|
||||
your bcache devices. If a backing device has data in a cache somewhere, the
|
||||
/dev/bcache# device won't be created until the cache shows up - particularly
|
||||
/dev/bcache<N> device won't be created until the cache shows up - particularly
|
||||
important if you have writeback caching turned on.
|
||||
|
||||
If you're booting up and your cache device is gone and never coming back, you
|
||||
@ -191,6 +195,9 @@ want for getting the best possible numbers when benchmarking.
|
||||
|
||||
SYSFS - BACKING DEVICE:
|
||||
|
||||
Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and
|
||||
(if attached) /sys/fs/bcache/<cset-uuid>/bdev*
|
||||
|
||||
attach
|
||||
Echo the UUID of a cache set to this file to enable caching.
|
||||
|
||||
@ -300,6 +307,8 @@ cache_readaheads
|
||||
|
||||
SYSFS - CACHE SET:
|
||||
|
||||
Available at /sys/fs/bcache/<cset-uuid>
|
||||
|
||||
average_key_size
|
||||
Average data per key in the btree.
|
||||
|
||||
@ -390,6 +399,8 @@ trigger_gc
|
||||
|
||||
SYSFS - CACHE DEVICE:
|
||||
|
||||
Available at /sys/block/<cdev>/bcache
|
||||
|
||||
block_size
|
||||
Minimum granularity of writes - should match hardware sector size.
|
||||
|
||||
|
@ -1642,7 +1642,7 @@ S: Maintained
|
||||
F: drivers/net/hamradio/baycom*
|
||||
|
||||
BCACHE (BLOCK LAYER CACHE)
|
||||
M: Kent Overstreet <koverstreet@google.com>
|
||||
M: Kent Overstreet <kmo@daterainc.com>
|
||||
L: linux-bcache@vger.kernel.org
|
||||
W: http://bcache.evilpiepirate.org
|
||||
S: Maintained
|
||||
@ -3346,7 +3346,7 @@ F: Documentation/firmware_class/
|
||||
F: drivers/base/firmware*.c
|
||||
F: include/linux/firmware.h
|
||||
|
||||
FLASHSYSTEM DRIVER (IBM FlashSystem 70/80 PCI SSD Flash Card)
|
||||
FLASH ADAPTER DRIVER (IBM Flash Adapter 900GB Full Height PCI Flash Card)
|
||||
M: Joshua Morris <josh.h.morris@us.ibm.com>
|
||||
M: Philip Kelleher <pjk1939@linux.vnet.ibm.com>
|
||||
S: Maintained
|
||||
|
@ -532,11 +532,11 @@ config BLK_DEV_RBD
|
||||
If unsure, say N.
|
||||
|
||||
config BLK_DEV_RSXX
|
||||
tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver"
|
||||
tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver"
|
||||
depends on PCI
|
||||
help
|
||||
Device driver for IBM's high speed PCIe SSD
|
||||
storage devices: FlashSystem-70 and FlashSystem-80.
|
||||
storage device: Flash Adapter 900GB Full Height.
|
||||
|
||||
To compile this driver as a module, choose M here: the
|
||||
module will be called rsxx.
|
||||
|
@ -659,6 +659,27 @@ void drbd_al_shrink(struct drbd_conf *mdev)
|
||||
wake_up(&mdev->al_wait);
|
||||
}
|
||||
|
||||
/**
 * drbd_initialize_al() - Write an "initialized" transaction block to every
 *                        4k slot of the on-disk activity log
 * @mdev:   DRBD device; its mdev->ldev->md supplies the activity log location
 *          (md_offset + al_offset) and size (al_stripes * al_stripe_size_4k).
 * @buffer: 4096-byte scratch area, interpreted as struct al_transaction_on_disk
 *          and overwritten here.
 *
 * NOTE(review): @buffer is filled in but not handed to drbd_md_sync_page_io();
 * presumably it is the meta-data I/O page that function submits (the caller in
 * drbd_determine_dev_size() obtains it from drbd_md_get_buffer()) - confirm.
 *
 * Return: 0 on success, otherwise the first non-zero value returned by
 * drbd_md_sync_page_io(); remaining slots are not written in that case.
 */
int drbd_initialize_al(struct drbd_conf *mdev, void *buffer)
|
||||
{
|
||||
struct al_transaction_on_disk *al = buffer;
|
||||
struct drbd_md *md = &mdev->ldev->md;
|
||||
sector_t al_base = md->md_offset + md->al_offset;
|
||||
int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
|
||||
int i;
|
||||
|
||||
/* Build one empty transaction: all zero except magic, type and checksum. */
memset(al, 0, 4096);
|
||||
al->magic = cpu_to_be32(DRBD_AL_MAGIC);
|
||||
al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
|
||||
/* The crc32c field is still zero (from the memset) while the checksum over
 * the whole 4096 bytes is computed. */
al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
|
||||
|
||||
/* Write the same block to each slot; slot i starts at sector al_base + i * 8
 * (presumably 8 sectors of 512 bytes = one 4k block - confirm). */
for (i = 0; i < al_size_4k; i++) {
|
||||
int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int w_update_odbm(struct drbd_work *w, int unused)
|
||||
{
|
||||
struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
|
||||
|
@ -832,6 +832,7 @@ struct drbd_tconn { /* is a resource from the config file */
|
||||
unsigned susp_nod:1; /* IO suspended because no data */
|
||||
unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
|
||||
struct mutex cstate_mutex; /* Protects graceful disconnects */
|
||||
unsigned int connect_cnt; /* Inc each time a connection is established */
|
||||
|
||||
unsigned long flags;
|
||||
struct net_conf *net_conf; /* content protected by rcu */
|
||||
@ -1132,6 +1133,7 @@ extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
|
||||
void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
|
||||
|
||||
extern void conn_md_sync(struct drbd_tconn *tconn);
|
||||
extern void drbd_md_write(struct drbd_conf *mdev, void *buffer);
|
||||
extern void drbd_md_sync(struct drbd_conf *mdev);
|
||||
extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
|
||||
extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
|
||||
@ -1466,8 +1468,16 @@ extern void drbd_suspend_io(struct drbd_conf *mdev);
|
||||
extern void drbd_resume_io(struct drbd_conf *mdev);
|
||||
extern char *ppsize(char *buf, unsigned long long size);
|
||||
extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int);
|
||||
enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
|
||||
extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
|
||||
/*
 * Result of drbd_determine_dev_size().
 * Negative values are errors (callers test "<= DS_ERROR"); non-negative
 * values report how the new size compares to the last agreed size.
 */
enum determine_dev_size {
|
||||
/* New size would be below the last agreed size while an AL layout change
 * was requested and no explicit user size (u_size == 0) is set. */
DS_ERROR_SHRINK = -3,
|
||||
/* The configured user size (u_size) is larger than the size that fits;
 * drbd_adm_resize() reports this as ERR_MD_LAYOUT_NO_FIT. */
DS_ERROR_SPACE_MD = -2,
|
||||
/* Generic failure, e.g. no meta-data buffer available or the resize of
 * the device failed. */
DS_ERROR = -1,
|
||||
/* Size is the same as the last agreed size. */
DS_UNCHANGED = 0,
|
||||
/* New size is smaller than the last agreed size. */
DS_SHRUNK = 1,
|
||||
/* New size is larger than the last agreed size. */
DS_GREW = 2
|
||||
};
|
||||
extern enum determine_dev_size
|
||||
drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local);
|
||||
extern void resync_after_online_grow(struct drbd_conf *);
|
||||
extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
|
||||
extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
|
||||
@ -1633,6 +1643,7 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
|
||||
#define drbd_set_out_of_sync(mdev, sector, size) \
|
||||
__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
|
||||
extern void drbd_al_shrink(struct drbd_conf *mdev);
|
||||
extern int drbd_initialize_al(struct drbd_conf *, void *);
|
||||
|
||||
/* drbd_nl.c */
|
||||
/* state info broadcast */
|
||||
|
@ -2762,8 +2762,6 @@ int __init drbd_init(void)
|
||||
/*
|
||||
* allocate all necessary structs
|
||||
*/
|
||||
err = -ENOMEM;
|
||||
|
||||
init_waitqueue_head(&drbd_pp_wait);
|
||||
|
||||
drbd_proc = NULL; /* play safe for drbd_cleanup */
|
||||
@ -2773,6 +2771,7 @@ int __init drbd_init(void)
|
||||
if (err)
|
||||
goto fail;
|
||||
|
||||
err = -ENOMEM;
|
||||
drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
|
||||
if (!drbd_proc) {
|
||||
printk(KERN_ERR "drbd: unable to register proc file\n");
|
||||
@ -2803,7 +2802,6 @@ int __init drbd_init(void)
|
||||
fail:
|
||||
drbd_cleanup();
|
||||
if (err == -ENOMEM)
|
||||
/* currently always the case */
|
||||
printk(KERN_ERR "drbd: ran out of memory\n");
|
||||
else
|
||||
printk(KERN_ERR "drbd: initialization failure\n");
|
||||
@ -2881,34 +2879,14 @@ struct meta_data_on_disk {
|
||||
u8 reserved_u8[4096 - (7*8 + 10*4)];
|
||||
} __packed;
|
||||
|
||||
/**
|
||||
* drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
|
||||
* @mdev: DRBD device.
|
||||
*/
|
||||
void drbd_md_sync(struct drbd_conf *mdev)
|
||||
|
||||
|
||||
void drbd_md_write(struct drbd_conf *mdev, void *b)
|
||||
{
|
||||
struct meta_data_on_disk *buffer;
|
||||
struct meta_data_on_disk *buffer = b;
|
||||
sector_t sector;
|
||||
int i;
|
||||
|
||||
/* Don't accidentally change the DRBD meta data layout. */
|
||||
BUILD_BUG_ON(UI_SIZE != 4);
|
||||
BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
|
||||
|
||||
del_timer(&mdev->md_sync_timer);
|
||||
/* timer may be rearmed by drbd_md_mark_dirty() now. */
|
||||
if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
|
||||
return;
|
||||
|
||||
/* We use here D_FAILED and not D_ATTACHING because we try to write
|
||||
* metadata even if we detach due to a disk failure! */
|
||||
if (!get_ldev_if_state(mdev, D_FAILED))
|
||||
return;
|
||||
|
||||
buffer = drbd_md_get_buffer(mdev);
|
||||
if (!buffer)
|
||||
goto out;
|
||||
|
||||
memset(buffer, 0, sizeof(*buffer));
|
||||
|
||||
buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
|
||||
@ -2937,6 +2915,35 @@ void drbd_md_sync(struct drbd_conf *mdev)
|
||||
dev_err(DEV, "meta data update failed!\n");
|
||||
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
|
||||
* @mdev: DRBD device.
|
||||
*/
|
||||
void drbd_md_sync(struct drbd_conf *mdev)
|
||||
{
|
||||
struct meta_data_on_disk *buffer;
|
||||
|
||||
/* Don't accidentally change the DRBD meta data layout. */
|
||||
BUILD_BUG_ON(UI_SIZE != 4);
|
||||
BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
|
||||
|
||||
del_timer(&mdev->md_sync_timer);
|
||||
/* timer may be rearmed by drbd_md_mark_dirty() now. */
|
||||
if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
|
||||
return;
|
||||
|
||||
/* We use here D_FAILED and not D_ATTACHING because we try to write
|
||||
* metadata even if we detach due to a disk failure! */
|
||||
if (!get_ldev_if_state(mdev, D_FAILED))
|
||||
return;
|
||||
|
||||
buffer = drbd_md_get_buffer(mdev);
|
||||
if (!buffer)
|
||||
goto out;
|
||||
|
||||
drbd_md_write(mdev, buffer);
|
||||
|
||||
/* Update mdev->ldev->md.la_size_sect,
|
||||
* since we updated it on metadata. */
|
||||
|
@ -417,6 +417,7 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
|
||||
|
||||
bool conn_try_outdate_peer(struct drbd_tconn *tconn)
|
||||
{
|
||||
unsigned int connect_cnt;
|
||||
union drbd_state mask = { };
|
||||
union drbd_state val = { };
|
||||
enum drbd_fencing_p fp;
|
||||
@ -428,6 +429,10 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
|
||||
return false;
|
||||
}
|
||||
|
||||
spin_lock_irq(&tconn->req_lock);
|
||||
connect_cnt = tconn->connect_cnt;
|
||||
spin_unlock_irq(&tconn->req_lock);
|
||||
|
||||
fp = highest_fencing_policy(tconn);
|
||||
switch (fp) {
|
||||
case FP_NOT_AVAIL:
|
||||
@ -492,8 +497,14 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
|
||||
here, because we might have been able to re-establish the connection in the
|
||||
meantime. */
|
||||
spin_lock_irq(&tconn->req_lock);
|
||||
if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags))
|
||||
_conn_request_state(tconn, mask, val, CS_VERBOSE);
|
||||
if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) {
|
||||
if (tconn->connect_cnt != connect_cnt)
|
||||
/* In case the connection was established and dropped
|
||||
while the fence-peer handler was running, ignore it */
|
||||
conn_info(tconn, "Ignoring fence-peer exit code\n");
|
||||
else
|
||||
_conn_request_state(tconn, mask, val, CS_VERBOSE);
|
||||
}
|
||||
spin_unlock_irq(&tconn->req_lock);
|
||||
|
||||
return conn_highest_pdsk(tconn) <= D_OUTDATED;
|
||||
@ -816,15 +827,20 @@ void drbd_resume_io(struct drbd_conf *mdev)
|
||||
* Returns 0 on success, negative return values indicate errors.
|
||||
* You should call drbd_md_sync() after calling this function.
|
||||
*/
|
||||
enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
|
||||
enum determine_dev_size
|
||||
drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
|
||||
{
|
||||
sector_t prev_first_sect, prev_size; /* previous meta location */
|
||||
sector_t la_size_sect, u_size;
|
||||
struct drbd_md *md = &mdev->ldev->md;
|
||||
u32 prev_al_stripe_size_4k;
|
||||
u32 prev_al_stripes;
|
||||
sector_t size;
|
||||
char ppb[10];
|
||||
void *buffer;
|
||||
|
||||
int md_moved, la_size_changed;
|
||||
enum determine_dev_size rv = unchanged;
|
||||
enum determine_dev_size rv = DS_UNCHANGED;
|
||||
|
||||
/* race:
|
||||
* application request passes inc_ap_bio,
|
||||
@ -836,6 +852,11 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
|
||||
* still lock the act_log to not trigger ASSERTs there.
|
||||
*/
|
||||
drbd_suspend_io(mdev);
|
||||
buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */
|
||||
if (!buffer) {
|
||||
drbd_resume_io(mdev);
|
||||
return DS_ERROR;
|
||||
}
|
||||
|
||||
/* no wait necessary anymore, actually we could assert that */
|
||||
wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
|
||||
@ -844,7 +865,17 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
|
||||
prev_size = mdev->ldev->md.md_size_sect;
|
||||
la_size_sect = mdev->ldev->md.la_size_sect;
|
||||
|
||||
/* TODO: should only be some assert here, not (re)init... */
|
||||
if (rs) {
|
||||
/* rs is non NULL if we should change the AL layout only */
|
||||
|
||||
prev_al_stripes = md->al_stripes;
|
||||
prev_al_stripe_size_4k = md->al_stripe_size_4k;
|
||||
|
||||
md->al_stripes = rs->al_stripes;
|
||||
md->al_stripe_size_4k = rs->al_stripe_size / 4;
|
||||
md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
|
||||
}
|
||||
|
||||
drbd_md_set_sector_offsets(mdev, mdev->ldev);
|
||||
|
||||
rcu_read_lock();
|
||||
@ -852,6 +883,21 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
|
||||
rcu_read_unlock();
|
||||
size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
|
||||
|
||||
if (size < la_size_sect) {
|
||||
if (rs && u_size == 0) {
|
||||
/* Remove "rs &&" later. This check should always be active, but
|
||||
right now the receiver expects the permissive behavior */
|
||||
dev_warn(DEV, "Implicit shrink not allowed. "
|
||||
"Use --size=%llus for explicit shrink.\n",
|
||||
(unsigned long long)size);
|
||||
rv = DS_ERROR_SHRINK;
|
||||
}
|
||||
if (u_size > size)
|
||||
rv = DS_ERROR_SPACE_MD;
|
||||
if (rv != DS_UNCHANGED)
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (drbd_get_capacity(mdev->this_bdev) != size ||
|
||||
drbd_bm_capacity(mdev) != size) {
|
||||
int err;
|
||||
@ -867,7 +913,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
|
||||
"Leaving size unchanged at size = %lu KB\n",
|
||||
(unsigned long)size);
|
||||
}
|
||||
rv = dev_size_error;
|
||||
rv = DS_ERROR;
|
||||
}
|
||||
/* racy, see comments above. */
|
||||
drbd_set_my_capacity(mdev, size);
|
||||
@ -875,38 +921,57 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
|
||||
dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
|
||||
(unsigned long long)size>>1);
|
||||
}
|
||||
if (rv == dev_size_error)
|
||||
goto out;
|
||||
if (rv <= DS_ERROR)
|
||||
goto err_out;
|
||||
|
||||
la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
|
||||
|
||||
md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
|
||||
|| prev_size != mdev->ldev->md.md_size_sect;
|
||||
|
||||
if (la_size_changed || md_moved) {
|
||||
int err;
|
||||
if (la_size_changed || md_moved || rs) {
|
||||
u32 prev_flags;
|
||||
|
||||
drbd_al_shrink(mdev); /* All extents inactive. */
|
||||
|
||||
prev_flags = md->flags;
|
||||
md->flags &= ~MDF_PRIMARY_IND;
|
||||
drbd_md_write(mdev, buffer);
|
||||
|
||||
dev_info(DEV, "Writing the whole bitmap, %s\n",
|
||||
la_size_changed && md_moved ? "size changed and md moved" :
|
||||
la_size_changed ? "size changed" : "md moved");
|
||||
/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
|
||||
err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
|
||||
"size changed", BM_LOCKED_MASK);
|
||||
if (err) {
|
||||
rv = dev_size_error;
|
||||
goto out;
|
||||
}
|
||||
drbd_md_mark_dirty(mdev);
|
||||
drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
|
||||
"size changed", BM_LOCKED_MASK);
|
||||
drbd_initialize_al(mdev, buffer);
|
||||
|
||||
md->flags = prev_flags;
|
||||
drbd_md_write(mdev, buffer);
|
||||
|
||||
if (rs)
|
||||
dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
|
||||
md->al_stripes, md->al_stripe_size_4k * 4);
|
||||
}
|
||||
|
||||
if (size > la_size_sect)
|
||||
rv = grew;
|
||||
rv = DS_GREW;
|
||||
if (size < la_size_sect)
|
||||
rv = shrunk;
|
||||
out:
|
||||
rv = DS_SHRUNK;
|
||||
|
||||
if (0) {
|
||||
err_out:
|
||||
if (rs) {
|
||||
md->al_stripes = prev_al_stripes;
|
||||
md->al_stripe_size_4k = prev_al_stripe_size_4k;
|
||||
md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
|
||||
|
||||
drbd_md_set_sector_offsets(mdev, mdev->ldev);
|
||||
}
|
||||
}
|
||||
lc_unlock(mdev->act_log);
|
||||
wake_up(&mdev->al_wait);
|
||||
drbd_md_put_buffer(mdev);
|
||||
drbd_resume_io(mdev);
|
||||
|
||||
return rv;
|
||||
@ -1607,11 +1672,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
|
||||
!drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
|
||||
set_bit(USE_DEGR_WFC_T, &mdev->flags);
|
||||
|
||||
dd = drbd_determine_dev_size(mdev, 0);
|
||||
if (dd == dev_size_error) {
|
||||
dd = drbd_determine_dev_size(mdev, 0, NULL);
|
||||
if (dd <= DS_ERROR) {
|
||||
retcode = ERR_NOMEM_BITMAP;
|
||||
goto force_diskless_dec;
|
||||
} else if (dd == grew)
|
||||
} else if (dd == DS_GREW)
|
||||
set_bit(RESYNC_AFTER_NEG, &mdev->flags);
|
||||
|
||||
if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
|
||||
@ -2305,6 +2370,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
|
||||
struct drbd_conf *mdev;
|
||||
enum drbd_ret_code retcode;
|
||||
enum determine_dev_size dd;
|
||||
bool change_al_layout = false;
|
||||
enum dds_flags ddsf;
|
||||
sector_t u_size;
|
||||
int err;
|
||||
@ -2315,31 +2381,33 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
|
||||
if (retcode != NO_ERROR)
|
||||
goto fail;
|
||||
|
||||
mdev = adm_ctx.mdev;
|
||||
if (!get_ldev(mdev)) {
|
||||
retcode = ERR_NO_DISK;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
memset(&rs, 0, sizeof(struct resize_parms));
|
||||
rs.al_stripes = mdev->ldev->md.al_stripes;
|
||||
rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4;
|
||||
if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
|
||||
err = resize_parms_from_attrs(&rs, info);
|
||||
if (err) {
|
||||
retcode = ERR_MANDATORY_TAG;
|
||||
drbd_msg_put_info(from_attrs_err_to_txt(err));
|
||||
goto fail;
|
||||
goto fail_ldev;
|
||||
}
|
||||
}
|
||||
|
||||
mdev = adm_ctx.mdev;
|
||||
if (mdev->state.conn > C_CONNECTED) {
|
||||
retcode = ERR_RESIZE_RESYNC;
|
||||
goto fail;
|
||||
goto fail_ldev;
|
||||
}
|
||||
|
||||
if (mdev->state.role == R_SECONDARY &&
|
||||
mdev->state.peer == R_SECONDARY) {
|
||||
retcode = ERR_NO_PRIMARY;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (!get_ldev(mdev)) {
|
||||
retcode = ERR_NO_DISK;
|
||||
goto fail;
|
||||
goto fail_ldev;
|
||||
}
|
||||
|
||||
if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) {
|
||||
@ -2358,6 +2426,28 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
|
||||
}
|
||||
}
|
||||
|
||||
if (mdev->ldev->md.al_stripes != rs.al_stripes ||
|
||||
mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
|
||||
u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
|
||||
|
||||
if (al_size_k > (16 * 1024 * 1024)) {
|
||||
retcode = ERR_MD_LAYOUT_TOO_BIG;
|
||||
goto fail_ldev;
|
||||
}
|
||||
|
||||
if (al_size_k < MD_32kB_SECT/2) {
|
||||
retcode = ERR_MD_LAYOUT_TOO_SMALL;
|
||||
goto fail_ldev;
|
||||
}
|
||||
|
||||
if (mdev->state.conn != C_CONNECTED) {
|
||||
retcode = ERR_MD_LAYOUT_CONNECTED;
|
||||
goto fail_ldev;
|
||||
}
|
||||
|
||||
change_al_layout = true;
|
||||
}
|
||||
|
||||
if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
|
||||
mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
|
||||
|
||||
@ -2373,16 +2463,22 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
|
||||
}
|
||||
|
||||
ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
|
||||
dd = drbd_determine_dev_size(mdev, ddsf);
|
||||
dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL);
|
||||
drbd_md_sync(mdev);
|
||||
put_ldev(mdev);
|
||||
if (dd == dev_size_error) {
|
||||
if (dd == DS_ERROR) {
|
||||
retcode = ERR_NOMEM_BITMAP;
|
||||
goto fail;
|
||||
} else if (dd == DS_ERROR_SPACE_MD) {
|
||||
retcode = ERR_MD_LAYOUT_NO_FIT;
|
||||
goto fail;
|
||||
} else if (dd == DS_ERROR_SHRINK) {
|
||||
retcode = ERR_IMPLICIT_SHRINK;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (mdev->state.conn == C_CONNECTED) {
|
||||
if (dd == grew)
|
||||
if (dd == DS_GREW)
|
||||
set_bit(RESIZE_PENDING, &mdev->flags);
|
||||
|
||||
drbd_send_uuids(mdev);
|
||||
@ -2658,7 +2754,6 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
|
||||
const struct sib_info *sib)
|
||||
{
|
||||
struct state_info *si = NULL; /* for sizeof(si->member); */
|
||||
struct net_conf *nc;
|
||||
struct nlattr *nla;
|
||||
int got_ldev;
|
||||
int err = 0;
|
||||
@ -2688,13 +2783,19 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
|
||||
goto nla_put_failure;
|
||||
|
||||
rcu_read_lock();
|
||||
if (got_ldev)
|
||||
if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
|
||||
goto nla_put_failure;
|
||||
if (got_ldev) {
|
||||
struct disk_conf *disk_conf;
|
||||
|
||||
nc = rcu_dereference(mdev->tconn->net_conf);
|
||||
if (nc)
|
||||
err = net_conf_to_skb(skb, nc, exclude_sensitive);
|
||||
disk_conf = rcu_dereference(mdev->ldev->disk_conf);
|
||||
err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
|
||||
}
|
||||
if (!err) {
|
||||
struct net_conf *nc;
|
||||
|
||||
nc = rcu_dereference(mdev->tconn->net_conf);
|
||||
if (nc)
|
||||
err = net_conf_to_skb(skb, nc, exclude_sensitive);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (err)
|
||||
goto nla_put_failure;
|
||||
|
@ -1039,6 +1039,8 @@ randomize:
|
||||
rcu_read_lock();
|
||||
idr_for_each_entry(&tconn->volumes, mdev, vnr) {
|
||||
kref_get(&mdev->kref);
|
||||
rcu_read_unlock();
|
||||
|
||||
/* Prevent a race between resync-handshake and
|
||||
* being promoted to Primary.
|
||||
*
|
||||
@ -1049,8 +1051,6 @@ randomize:
|
||||
mutex_lock(mdev->state_mutex);
|
||||
mutex_unlock(mdev->state_mutex);
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
if (discard_my_data)
|
||||
set_bit(DISCARD_MY_DATA, &mdev->flags);
|
||||
else
|
||||
@ -3545,7 +3545,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
|
||||
{
|
||||
struct drbd_conf *mdev;
|
||||
struct p_sizes *p = pi->data;
|
||||
enum determine_dev_size dd = unchanged;
|
||||
enum determine_dev_size dd = DS_UNCHANGED;
|
||||
sector_t p_size, p_usize, my_usize;
|
||||
int ldsc = 0; /* local disk size changed */
|
||||
enum dds_flags ddsf;
|
||||
@ -3617,9 +3617,9 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
|
||||
|
||||
ddsf = be16_to_cpu(p->dds_flags);
|
||||
if (get_ldev(mdev)) {
|
||||
dd = drbd_determine_dev_size(mdev, ddsf);
|
||||
dd = drbd_determine_dev_size(mdev, ddsf, NULL);
|
||||
put_ldev(mdev);
|
||||
if (dd == dev_size_error)
|
||||
if (dd == DS_ERROR)
|
||||
return -EIO;
|
||||
drbd_md_sync(mdev);
|
||||
} else {
|
||||
@ -3647,7 +3647,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
|
||||
drbd_send_sizes(mdev, 0, ddsf);
|
||||
}
|
||||
if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
|
||||
(dd == grew && mdev->state.conn == C_CONNECTED)) {
|
||||
(dd == DS_GREW && mdev->state.conn == C_CONNECTED)) {
|
||||
if (mdev->state.pdsk >= D_INCONSISTENT &&
|
||||
mdev->state.disk >= D_INCONSISTENT) {
|
||||
if (ddsf & DDSF_NO_RESYNC)
|
||||
|
@ -1115,8 +1115,10 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
|
||||
drbd_thread_restart_nowait(&mdev->tconn->receiver);
|
||||
|
||||
/* Resume AL writing if we get a connection */
|
||||
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
|
||||
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
|
||||
drbd_resume_al(mdev);
|
||||
mdev->tconn->connect_cnt++;
|
||||
}
|
||||
|
||||
/* remember last attach time so request_timer_fn() won't
|
||||
* kill newly established sessions while we are still trying to thaw
|
||||
|
@ -31,6 +31,8 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/seq_file.h>
|
||||
|
||||
#include <linux/genhd.h>
|
||||
#include <linux/idr.h>
|
||||
@ -39,8 +41,9 @@
|
||||
#include "rsxx_cfg.h"
|
||||
|
||||
#define NO_LEGACY 0
|
||||
#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */
|
||||
|
||||
MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver");
|
||||
MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver");
|
||||
MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_VERSION(DRIVER_VERSION);
|
||||
@ -49,9 +52,282 @@ static unsigned int force_legacy = NO_LEGACY;
|
||||
module_param(force_legacy, uint, 0444);
|
||||
MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts");
|
||||
|
||||
static unsigned int sync_start = 1;
|
||||
module_param(sync_start, uint, 0444);
|
||||
MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete "
|
||||
"until the card startup has completed.");
|
||||
|
||||
static DEFINE_IDA(rsxx_disk_ida);
|
||||
static DEFINE_SPINLOCK(rsxx_ida_lock);
|
||||
|
||||
/* --------------------Debugfs Setup ------------------- */
|
||||
|
||||
/*
 * Bookkeeping structure declared as part of the driver's debugfs setup.
 * NOTE(review): the code that fills and consumes these fields is not visible
 * in this excerpt; judging by the names, f_pos/offset look like file-position
 * values and i_private like an opaque per-inode pointer - confirm against the
 * debugfs read/write handlers.
 */
struct rsxx_cram {
|
||||
u32 f_pos;
|
||||
u32 offset;
|
||||
void *i_private;
|
||||
};
|
||||
|
||||
static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct rsxx_cardinfo *card = m->private;
|
||||
|
||||
seq_printf(m, "HWID 0x%08x\n",
|
||||
ioread32(card->regmap + HWID));
|
||||
seq_printf(m, "SCRATCH 0x%08x\n",
|
||||
ioread32(card->regmap + SCRATCH));
|
||||
seq_printf(m, "IER 0x%08x\n",
|
||||
ioread32(card->regmap + IER));
|
||||
seq_printf(m, "IPR 0x%08x\n",
|
||||
ioread32(card->regmap + IPR));
|
||||
seq_printf(m, "CREG_CMD 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_CMD));
|
||||
seq_printf(m, "CREG_ADD 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_ADD));
|
||||
seq_printf(m, "CREG_CNT 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_CNT));
|
||||
seq_printf(m, "CREG_STAT 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_STAT));
|
||||
seq_printf(m, "CREG_DATA0 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_DATA0));
|
||||
seq_printf(m, "CREG_DATA1 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_DATA1));
|
||||
seq_printf(m, "CREG_DATA2 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_DATA2));
|
||||
seq_printf(m, "CREG_DATA3 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_DATA3));
|
||||
seq_printf(m, "CREG_DATA4 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_DATA4));
|
||||
seq_printf(m, "CREG_DATA5 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_DATA5));
|
||||
seq_printf(m, "CREG_DATA6 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_DATA6));
|
||||
seq_printf(m, "CREG_DATA7 0x%08x\n",
|
||||
ioread32(card->regmap + CREG_DATA7));
|
||||
seq_printf(m, "INTR_COAL 0x%08x\n",
|
||||
ioread32(card->regmap + INTR_COAL));
|
||||
seq_printf(m, "HW_ERROR 0x%08x\n",
|
||||
ioread32(card->regmap + HW_ERROR));
|
||||
seq_printf(m, "DEBUG0 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_DEBUG0));
|
||||
seq_printf(m, "DEBUG1 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_DEBUG1));
|
||||
seq_printf(m, "DEBUG2 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_DEBUG2));
|
||||
seq_printf(m, "DEBUG3 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_DEBUG3));
|
||||
seq_printf(m, "DEBUG4 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_DEBUG4));
|
||||
seq_printf(m, "DEBUG5 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_DEBUG5));
|
||||
seq_printf(m, "DEBUG6 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_DEBUG6));
|
||||
seq_printf(m, "DEBUG7 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_DEBUG7));
|
||||
seq_printf(m, "RECONFIG 0x%08x\n",
|
||||
ioread32(card->regmap + PCI_RECONFIG));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int rsxx_attr_stats_show(struct seq_file *m, void *p)
|
||||
{
|
||||
struct rsxx_cardinfo *card = m->private;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
seq_printf(m, "Ctrl %d CRC Errors = %d\n",
|
||||
i, card->ctrl[i].stats.crc_errors);
|
||||
seq_printf(m, "Ctrl %d Hard Errors = %d\n",
|
||||
i, card->ctrl[i].stats.hard_errors);
|
||||
seq_printf(m, "Ctrl %d Soft Errors = %d\n",
|
||||
i, card->ctrl[i].stats.soft_errors);
|
||||
seq_printf(m, "Ctrl %d Writes Issued = %d\n",
|
||||
i, card->ctrl[i].stats.writes_issued);
|
||||
seq_printf(m, "Ctrl %d Writes Failed = %d\n",
|
||||
i, card->ctrl[i].stats.writes_failed);
|
||||
seq_printf(m, "Ctrl %d Reads Issued = %d\n",
|
||||
i, card->ctrl[i].stats.reads_issued);
|
||||
seq_printf(m, "Ctrl %d Reads Failed = %d\n",
|
||||
i, card->ctrl[i].stats.reads_failed);
|
||||
seq_printf(m, "Ctrl %d Reads Retried = %d\n",
|
||||
i, card->ctrl[i].stats.reads_retried);
|
||||
seq_printf(m, "Ctrl %d Discards Issued = %d\n",
|
||||
i, card->ctrl[i].stats.discards_issued);
|
||||
seq_printf(m, "Ctrl %d Discards Failed = %d\n",
|
||||
i, card->ctrl[i].stats.discards_failed);
|
||||
seq_printf(m, "Ctrl %d DMA SW Errors = %d\n",
|
||||
i, card->ctrl[i].stats.dma_sw_err);
|
||||
seq_printf(m, "Ctrl %d DMA HW Faults = %d\n",
|
||||
i, card->ctrl[i].stats.dma_hw_fault);
|
||||
seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n",
|
||||
i, card->ctrl[i].stats.dma_cancelled);
|
||||
seq_printf(m, "Ctrl %d SW Queue Depth = %d\n",
|
||||
i, card->ctrl[i].stats.sw_q_depth);
|
||||
seq_printf(m, "Ctrl %d HW Queue Depth = %d\n",
|
||||
i, atomic_read(&card->ctrl[i].stats.hw_q_depth));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int rsxx_attr_stats_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return single_open(file, rsxx_attr_stats_show, inode->i_private);
|
||||
}
|
||||
|
||||
static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return single_open(file, rsxx_attr_pci_regs_show, inode->i_private);
|
||||
}
|
||||
|
||||
/*
 * read() for the debugfs "cram" file.
 *
 * Reads @cnt bytes of card CRAM, starting at CREG_ADD_CRAM plus the current
 * position ((u32)*ppos + the per-open running offset), into a bounce buffer
 * and copies them to @ubuf.
 *
 * Returns @cnt on success, -ENOMEM if the bounce buffer cannot be allocated,
 * the status of rsxx_creg_read() if the register read fails, or -EFAULT if
 * the data cannot be copied to user space. The bounce buffer is released on
 * every path, and the running offset only advances on success.
 */
static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf,
			      size_t cnt, loff_t *ppos)
{
	struct rsxx_cram *info = fp->private_data;
	struct rsxx_cardinfo *card = info->i_private;
	ssize_t ret;
	char *buf;
	int st;

	buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	info->f_pos = (u32)*ppos + info->offset;

	st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
	if (st) {
		ret = st;
		goto out_free;
	}

	/*
	 * copy_to_user() returns the number of bytes it could NOT copy, which
	 * must not be handed back to the caller as a byte count.
	 */
	if (copy_to_user(ubuf, buf, cnt)) {
		ret = -EFAULT;
		goto out_free;
	}

	info->offset += cnt;
	ret = cnt;

out_free:
	kfree(buf);

	return ret;
}
|
||||
|
||||
/*
 * write() for the debugfs "cram" file.
 *
 * Copies @cnt bytes from @ubuf into a bounce buffer and writes them to card
 * CRAM, starting at CREG_ADD_CRAM plus the current position ((u32)*ppos +
 * the per-open running offset).
 *
 * Returns @cnt on success, -ENOMEM if the bounce buffer cannot be allocated,
 * -EFAULT if the user data cannot be read, or the status of
 * rsxx_creg_write() if the register write fails. The bounce buffer is
 * released on every path, and the running offset only advances on success.
 */
static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf,
			       size_t cnt, loff_t *ppos)
{
	struct rsxx_cram *info = fp->private_data;
	struct rsxx_cardinfo *card = info->i_private;
	ssize_t ret;
	char *buf;
	int st;

	buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/*
	 * copy_from_user() returns the number of bytes it could NOT copy,
	 * which must not be handed back to the caller as a byte count.
	 */
	if (copy_from_user(buf, ubuf, cnt)) {
		ret = -EFAULT;
		goto out_free;
	}

	info->f_pos = (u32)*ppos + info->offset;

	st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
	if (st) {
		ret = st;
		goto out_free;
	}

	info->offset += cnt;
	ret = cnt;

out_free:
	kfree(buf);

	return ret;
}
|
||||
|
||||
static int rsxx_cram_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct rsxx_cram *info = kzalloc(sizeof(*info), GFP_KERNEL);
|
||||
if (!info)
|
||||
return -ENOMEM;
|
||||
|
||||
info->i_private = inode->i_private;
|
||||
info->f_pos = file->f_pos;
|
||||
file->private_data = info;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int rsxx_cram_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct rsxx_cram *info = file->private_data;
|
||||
|
||||
if (!info)
|
||||
return 0;
|
||||
|
||||
kfree(info);
|
||||
file->private_data = NULL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * File operations for the debugfs "cram" file: read/write access to card
 * CRAM through the rsxx_cram_* handlers, with per-open state set up in
 * open and torn down in release.
 */
static const struct file_operations debugfs_cram_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = rsxx_cram_open,
|
||||
.read = rsxx_cram_read,
|
||||
.write = rsxx_cram_write,
|
||||
.release = rsxx_cram_release,
|
||||
};
|
||||
|
||||
/*
 * File operations for the debugfs "stats" file: a seq_file opened through
 * rsxx_attr_stats_open() and otherwise served by the generic seq_file
 * read/llseek/release helpers.
 */
static const struct file_operations debugfs_stats_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = rsxx_attr_stats_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
/*
 * File operations for the debugfs "pci_regs" file: a seq_file opened through
 * rsxx_attr_pci_regs_open() and otherwise served by the generic seq_file
 * read/llseek/release helpers.
 */
static const struct file_operations debugfs_pci_regs_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = rsxx_attr_pci_regs_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
|
||||
{
|
||||
struct dentry *debugfs_stats;
|
||||
struct dentry *debugfs_pci_regs;
|
||||
struct dentry *debugfs_cram;
|
||||
|
||||
card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL);
|
||||
if (IS_ERR_OR_NULL(card->debugfs_dir))
|
||||
goto failed_debugfs_dir;
|
||||
|
||||
debugfs_stats = debugfs_create_file("stats", S_IRUGO,
|
||||
card->debugfs_dir, card,
|
||||
&debugfs_stats_fops);
|
||||
if (IS_ERR_OR_NULL(debugfs_stats))
|
||||
goto failed_debugfs_stats;
|
||||
|
||||
debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
|
||||
card->debugfs_dir, card,
|
||||
&debugfs_pci_regs_fops);
|
||||
if (IS_ERR_OR_NULL(debugfs_pci_regs))
|
||||
goto failed_debugfs_pci_regs;
|
||||
|
||||
debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
|
||||
card->debugfs_dir, card,
|
||||
&debugfs_cram_fops);
|
||||
if (IS_ERR_OR_NULL(debugfs_cram))
|
||||
goto failed_debugfs_cram;
|
||||
|
||||
return;
|
||||
failed_debugfs_cram:
|
||||
debugfs_remove(debugfs_pci_regs);
|
||||
failed_debugfs_pci_regs:
|
||||
debugfs_remove(debugfs_stats);
|
||||
failed_debugfs_stats:
|
||||
debugfs_remove(card->debugfs_dir);
|
||||
failed_debugfs_dir:
|
||||
card->debugfs_dir = NULL;
|
||||
}
|
||||
|
||||
/*----------------- Interrupt Control & Handling -------------------*/
|
||||
|
||||
static void rsxx_mask_interrupts(struct rsxx_cardinfo *card)
|
||||
@ -163,12 +439,13 @@ static irqreturn_t rsxx_isr(int irq, void *pdata)
|
||||
}
|
||||
|
||||
if (isr & CR_INTR_CREG) {
|
||||
schedule_work(&card->creg_ctrl.done_work);
|
||||
queue_work(card->creg_ctrl.creg_wq,
|
||||
&card->creg_ctrl.done_work);
|
||||
handled++;
|
||||
}
|
||||
|
||||
if (isr & CR_INTR_EVENT) {
|
||||
schedule_work(&card->event_work);
|
||||
queue_work(card->event_wq, &card->event_work);
|
||||
rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
|
||||
handled++;
|
||||
}
|
||||
@ -329,7 +606,7 @@ static int rsxx_eeh_frozen(struct pci_dev *dev)
|
||||
int i;
|
||||
int st;
|
||||
|
||||
dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n");
|
||||
dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n");
|
||||
|
||||
card->eeh_state = 1;
|
||||
rsxx_mask_interrupts(card);
|
||||
@ -367,15 +644,26 @@ static void rsxx_eeh_failure(struct pci_dev *dev)
|
||||
{
|
||||
struct rsxx_cardinfo *card = pci_get_drvdata(dev);
|
||||
int i;
|
||||
int cnt = 0;
|
||||
|
||||
dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n");
|
||||
dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n");
|
||||
|
||||
card->eeh_state = 1;
|
||||
card->halt = 1;
|
||||
|
||||
for (i = 0; i < card->n_targets; i++)
|
||||
del_timer_sync(&card->ctrl[i].activity_timer);
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
spin_lock_bh(&card->ctrl[i].queue_lock);
|
||||
cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
|
||||
&card->ctrl[i].queue);
|
||||
spin_unlock_bh(&card->ctrl[i].queue_lock);
|
||||
|
||||
rsxx_eeh_cancel_dmas(card);
|
||||
cnt += rsxx_dma_cancel(&card->ctrl[i]);
|
||||
|
||||
if (cnt)
|
||||
dev_info(CARD_TO_DEV(card),
|
||||
"Freed %d queued DMAs on channel %d\n",
|
||||
cnt, card->ctrl[i].id);
|
||||
}
|
||||
}
|
||||
|
||||
static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card)
|
||||
@ -432,7 +720,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
|
||||
int st;
|
||||
|
||||
dev_warn(&dev->dev,
|
||||
"IBM FlashSystem PCI: recovering from slot reset.\n");
|
||||
"IBM Flash Adapter PCI: recovering from slot reset.\n");
|
||||
|
||||
st = pci_enable_device(dev);
|
||||
if (st)
|
||||
@ -485,7 +773,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
|
||||
&card->ctrl[i].issue_dma_work);
|
||||
}
|
||||
|
||||
dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n");
|
||||
dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n");
|
||||
|
||||
return PCI_ERS_RESULT_RECOVERED;
|
||||
|
||||
@ -528,6 +816,7 @@ static int rsxx_pci_probe(struct pci_dev *dev,
|
||||
{
|
||||
struct rsxx_cardinfo *card;
|
||||
int st;
|
||||
unsigned int sync_timeout;
|
||||
|
||||
dev_info(&dev->dev, "PCI-Flash SSD discovered\n");
|
||||
|
||||
@ -610,7 +899,11 @@ static int rsxx_pci_probe(struct pci_dev *dev,
|
||||
}
|
||||
|
||||
/************* Setup Processor Command Interface *************/
|
||||
rsxx_creg_setup(card);
|
||||
st = rsxx_creg_setup(card);
|
||||
if (st) {
|
||||
dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n");
|
||||
goto failed_creg_setup;
|
||||
}
|
||||
|
||||
spin_lock_irq(&card->irq_lock);
|
||||
rsxx_enable_ier_and_isr(card, CR_INTR_CREG);
|
||||
@ -650,6 +943,12 @@ static int rsxx_pci_probe(struct pci_dev *dev,
|
||||
}
|
||||
|
||||
/************* Setup Card Event Handler *************/
|
||||
card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event");
|
||||
if (!card->event_wq) {
|
||||
dev_err(CARD_TO_DEV(card), "Failed card event setup.\n");
|
||||
goto failed_event_handler;
|
||||
}
|
||||
|
||||
INIT_WORK(&card->event_work, card_event_handler);
|
||||
|
||||
st = rsxx_setup_dev(card);
|
||||
@ -676,6 +975,33 @@ static int rsxx_pci_probe(struct pci_dev *dev,
|
||||
if (st)
|
||||
dev_crit(CARD_TO_DEV(card),
|
||||
"Failed issuing card startup\n");
|
||||
if (sync_start) {
|
||||
sync_timeout = SYNC_START_TIMEOUT;
|
||||
|
||||
dev_info(CARD_TO_DEV(card),
|
||||
"Waiting for card to startup\n");
|
||||
|
||||
do {
|
||||
ssleep(1);
|
||||
sync_timeout--;
|
||||
|
||||
rsxx_get_card_state(card, &card->state);
|
||||
} while (sync_timeout &&
|
||||
(card->state == CARD_STATE_STARTING));
|
||||
|
||||
if (card->state == CARD_STATE_STARTING) {
|
||||
dev_warn(CARD_TO_DEV(card),
|
||||
"Card startup timed out\n");
|
||||
card->size8 = 0;
|
||||
} else {
|
||||
dev_info(CARD_TO_DEV(card),
|
||||
"card state: %s\n",
|
||||
rsxx_card_state_to_str(card->state));
|
||||
st = rsxx_get_card_size8(card, &card->size8);
|
||||
if (st)
|
||||
card->size8 = 0;
|
||||
}
|
||||
}
|
||||
} else if (card->state == CARD_STATE_GOOD ||
|
||||
card->state == CARD_STATE_RD_ONLY_FAULT) {
|
||||
st = rsxx_get_card_size8(card, &card->size8);
|
||||
@ -685,12 +1011,21 @@ static int rsxx_pci_probe(struct pci_dev *dev,
|
||||
|
||||
rsxx_attach_dev(card);
|
||||
|
||||
/************* Setup Debugfs *************/
|
||||
rsxx_debugfs_dev_new(card);
|
||||
|
||||
return 0;
|
||||
|
||||
failed_create_dev:
|
||||
destroy_workqueue(card->event_wq);
|
||||
card->event_wq = NULL;
|
||||
failed_event_handler:
|
||||
rsxx_dma_destroy(card);
|
||||
failed_dma_setup:
|
||||
failed_compatiblity_check:
|
||||
destroy_workqueue(card->creg_ctrl.creg_wq);
|
||||
card->creg_ctrl.creg_wq = NULL;
|
||||
failed_creg_setup:
|
||||
spin_lock_irq(&card->irq_lock);
|
||||
rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
|
||||
spin_unlock_irq(&card->irq_lock);
|
||||
@ -756,6 +1091,8 @@ static void rsxx_pci_remove(struct pci_dev *dev)
|
||||
/* Prevent work_structs from re-queuing themselves. */
|
||||
card->halt = 1;
|
||||
|
||||
debugfs_remove_recursive(card->debugfs_dir);
|
||||
|
||||
free_irq(dev->irq, card);
|
||||
|
||||
if (!force_legacy)
|
||||
|
@ -431,6 +431,15 @@ static int __issue_creg_rw(struct rsxx_cardinfo *card,
|
||||
*hw_stat = completion.creg_status;
|
||||
|
||||
if (completion.st) {
|
||||
/*
|
||||
* This read is needed to verify that there have not been any
|
||||
* extreme errors that might have occurred, i.e. EEH. The
|
||||
* function iowrite32 will not detect EEH errors, so it is
|
||||
* necessary that we recover if such an error is the reason
|
||||
* for the timeout. This is a dummy read.
|
||||
*/
|
||||
ioread32(card->regmap + SCRATCH);
|
||||
|
||||
dev_warn(CARD_TO_DEV(card),
|
||||
"creg command failed(%d x%08x)\n",
|
||||
completion.st, addr);
|
||||
@ -727,6 +736,11 @@ int rsxx_creg_setup(struct rsxx_cardinfo *card)
|
||||
{
|
||||
card->creg_ctrl.active_cmd = NULL;
|
||||
|
||||
card->creg_ctrl.creg_wq =
|
||||
create_singlethread_workqueue(DRIVER_NAME"_creg");
|
||||
if (!card->creg_ctrl.creg_wq)
|
||||
return -ENOMEM;
|
||||
|
||||
INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done);
|
||||
mutex_init(&card->creg_ctrl.reset_lock);
|
||||
INIT_LIST_HEAD(&card->creg_ctrl.queue);
|
||||
|
@ -155,7 +155,8 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
|
||||
atomic_set(&meta->error, 1);
|
||||
|
||||
if (atomic_dec_and_test(&meta->pending_dmas)) {
|
||||
disk_stats_complete(card, meta->bio, meta->start_time);
|
||||
if (!card->eeh_state && card->gendisk)
|
||||
disk_stats_complete(card, meta->bio, meta->start_time);
|
||||
|
||||
bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0);
|
||||
kmem_cache_free(bio_meta_pool, meta);
|
||||
@ -170,6 +171,12 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
|
||||
|
||||
might_sleep();
|
||||
|
||||
if (!card)
|
||||
goto req_err;
|
||||
|
||||
if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(card->gendisk))
|
||||
goto req_err;
|
||||
|
||||
if (unlikely(card->halt)) {
|
||||
st = -EFAULT;
|
||||
goto req_err;
|
||||
@ -196,7 +203,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
|
||||
atomic_set(&bio_meta->pending_dmas, 0);
|
||||
bio_meta->start_time = jiffies;
|
||||
|
||||
disk_stats_start(card, bio);
|
||||
if (!unlikely(card->halt))
|
||||
disk_stats_start(card, bio);
|
||||
|
||||
dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n",
|
||||
bio_data_dir(bio) ? 'W' : 'R', bio_meta,
|
||||
@ -225,24 +233,6 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
|
||||
return (pci_rev >= RSXX_DISCARD_SUPPORT);
|
||||
}
|
||||
|
||||
static unsigned short rsxx_get_logical_block_size(
|
||||
struct rsxx_cardinfo *card)
|
||||
{
|
||||
u32 capabilities = 0;
|
||||
int st;
|
||||
|
||||
st = rsxx_get_card_capabilities(card, &capabilities);
|
||||
if (st)
|
||||
dev_warn(CARD_TO_DEV(card),
|
||||
"Failed reading card capabilities register\n");
|
||||
|
||||
/* Earlier firmware did not have support for 512 byte accesses */
|
||||
if (capabilities & CARD_CAP_SUBPAGE_WRITES)
|
||||
return 512;
|
||||
else
|
||||
return RSXX_HW_BLK_SIZE;
|
||||
}
|
||||
|
||||
int rsxx_attach_dev(struct rsxx_cardinfo *card)
|
||||
{
|
||||
mutex_lock(&card->dev_lock);
|
||||
@ -305,7 +295,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
blk_size = rsxx_get_logical_block_size(card);
|
||||
blk_size = card->config.data.block_size;
|
||||
|
||||
blk_queue_make_request(card->queue, rsxx_make_request);
|
||||
blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
|
||||
@ -347,6 +337,7 @@ void rsxx_destroy_dev(struct rsxx_cardinfo *card)
|
||||
card->gendisk = NULL;
|
||||
|
||||
blk_cleanup_queue(card->queue);
|
||||
card->queue->queuedata = NULL;
|
||||
unregister_blkdev(card->major, DRIVER_NAME);
|
||||
}
|
||||
|
||||
|
@ -245,6 +245,22 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
|
||||
kmem_cache_free(rsxx_dma_pool, dma);
|
||||
}
|
||||
|
||||
int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
|
||||
struct list_head *q)
|
||||
{
|
||||
struct rsxx_dma *dma;
|
||||
struct rsxx_dma *tmp;
|
||||
int cnt = 0;
|
||||
|
||||
list_for_each_entry_safe(dma, tmp, q, list) {
|
||||
list_del(&dma->list);
|
||||
rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
|
||||
cnt++;
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
|
||||
struct rsxx_dma *dma)
|
||||
{
|
||||
@ -252,9 +268,10 @@ static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
|
||||
* Requeued DMAs go to the front of the queue so they are issued
|
||||
* first.
|
||||
*/
|
||||
spin_lock(&ctrl->queue_lock);
|
||||
spin_lock_bh(&ctrl->queue_lock);
|
||||
ctrl->stats.sw_q_depth++;
|
||||
list_add(&dma->list, &ctrl->queue);
|
||||
spin_unlock(&ctrl->queue_lock);
|
||||
spin_unlock_bh(&ctrl->queue_lock);
|
||||
}
|
||||
|
||||
static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
|
||||
@ -329,6 +346,7 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
|
||||
static void dma_engine_stalled(unsigned long data)
|
||||
{
|
||||
struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
|
||||
int cnt;
|
||||
|
||||
if (atomic_read(&ctrl->stats.hw_q_depth) == 0 ||
|
||||
unlikely(ctrl->card->eeh_state))
|
||||
@ -349,18 +367,28 @@ static void dma_engine_stalled(unsigned long data)
|
||||
"DMA channel %d has stalled, faulting interface.\n",
|
||||
ctrl->id);
|
||||
ctrl->card->dma_fault = 1;
|
||||
|
||||
/* Clean up the DMA queue */
|
||||
spin_lock(&ctrl->queue_lock);
|
||||
cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
|
||||
spin_unlock(&ctrl->queue_lock);
|
||||
|
||||
cnt += rsxx_dma_cancel(ctrl);
|
||||
|
||||
if (cnt)
|
||||
dev_info(CARD_TO_DEV(ctrl->card),
|
||||
"Freed %d queued DMAs on channel %d\n",
|
||||
cnt, ctrl->id);
|
||||
}
|
||||
}
|
||||
|
||||
static void rsxx_issue_dmas(struct work_struct *work)
|
||||
static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
|
||||
{
|
||||
struct rsxx_dma_ctrl *ctrl;
|
||||
struct rsxx_dma *dma;
|
||||
int tag;
|
||||
int cmds_pending = 0;
|
||||
struct hw_cmd *hw_cmd_buf;
|
||||
|
||||
ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
|
||||
hw_cmd_buf = ctrl->cmd.buf;
|
||||
|
||||
if (unlikely(ctrl->card->halt) ||
|
||||
@ -368,22 +396,22 @@ static void rsxx_issue_dmas(struct work_struct *work)
|
||||
return;
|
||||
|
||||
while (1) {
|
||||
spin_lock(&ctrl->queue_lock);
|
||||
spin_lock_bh(&ctrl->queue_lock);
|
||||
if (list_empty(&ctrl->queue)) {
|
||||
spin_unlock(&ctrl->queue_lock);
|
||||
spin_unlock_bh(&ctrl->queue_lock);
|
||||
break;
|
||||
}
|
||||
spin_unlock(&ctrl->queue_lock);
|
||||
spin_unlock_bh(&ctrl->queue_lock);
|
||||
|
||||
tag = pop_tracker(ctrl->trackers);
|
||||
if (tag == -1)
|
||||
break;
|
||||
|
||||
spin_lock(&ctrl->queue_lock);
|
||||
spin_lock_bh(&ctrl->queue_lock);
|
||||
dma = list_entry(ctrl->queue.next, struct rsxx_dma, list);
|
||||
list_del(&dma->list);
|
||||
ctrl->stats.sw_q_depth--;
|
||||
spin_unlock(&ctrl->queue_lock);
|
||||
spin_unlock_bh(&ctrl->queue_lock);
|
||||
|
||||
/*
|
||||
* This will catch any DMAs that slipped in right before the
|
||||
@ -440,9 +468,8 @@ static void rsxx_issue_dmas(struct work_struct *work)
|
||||
}
|
||||
}
|
||||
|
||||
static void rsxx_dma_done(struct work_struct *work)
|
||||
static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl)
|
||||
{
|
||||
struct rsxx_dma_ctrl *ctrl;
|
||||
struct rsxx_dma *dma;
|
||||
unsigned long flags;
|
||||
u16 count;
|
||||
@ -450,7 +477,6 @@ static void rsxx_dma_done(struct work_struct *work)
|
||||
u8 tag;
|
||||
struct hw_status *hw_st_buf;
|
||||
|
||||
ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
|
||||
hw_st_buf = ctrl->status.buf;
|
||||
|
||||
if (unlikely(ctrl->card->halt) ||
|
||||
@ -520,33 +546,32 @@ static void rsxx_dma_done(struct work_struct *work)
|
||||
rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id));
|
||||
spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
|
||||
|
||||
spin_lock(&ctrl->queue_lock);
|
||||
spin_lock_bh(&ctrl->queue_lock);
|
||||
if (ctrl->stats.sw_q_depth)
|
||||
queue_work(ctrl->issue_wq, &ctrl->issue_dma_work);
|
||||
spin_unlock(&ctrl->queue_lock);
|
||||
spin_unlock_bh(&ctrl->queue_lock);
|
||||
}
|
||||
|
||||
static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card,
|
||||
struct list_head *q)
|
||||
static void rsxx_schedule_issue(struct work_struct *work)
|
||||
{
|
||||
struct rsxx_dma *dma;
|
||||
struct rsxx_dma *tmp;
|
||||
int cnt = 0;
|
||||
struct rsxx_dma_ctrl *ctrl;
|
||||
|
||||
list_for_each_entry_safe(dma, tmp, q, list) {
|
||||
list_del(&dma->list);
|
||||
ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
|
||||
|
||||
if (dma->dma_addr)
|
||||
pci_unmap_page(card->dev, dma->dma_addr,
|
||||
get_dma_size(dma),
|
||||
(dma->cmd == HW_CMD_BLK_WRITE) ?
|
||||
PCI_DMA_TODEVICE :
|
||||
PCI_DMA_FROMDEVICE);
|
||||
kmem_cache_free(rsxx_dma_pool, dma);
|
||||
cnt++;
|
||||
}
|
||||
mutex_lock(&ctrl->work_lock);
|
||||
rsxx_issue_dmas(ctrl);
|
||||
mutex_unlock(&ctrl->work_lock);
|
||||
}
|
||||
|
||||
return cnt;
|
||||
static void rsxx_schedule_done(struct work_struct *work)
|
||||
{
|
||||
struct rsxx_dma_ctrl *ctrl;
|
||||
|
||||
ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
|
||||
|
||||
mutex_lock(&ctrl->work_lock);
|
||||
rsxx_dma_done(ctrl);
|
||||
mutex_unlock(&ctrl->work_lock);
|
||||
}
|
||||
|
||||
static int rsxx_queue_discard(struct rsxx_cardinfo *card,
|
||||
@ -698,10 +723,10 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
|
||||
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
if (!list_empty(&dma_list[i])) {
|
||||
spin_lock(&card->ctrl[i].queue_lock);
|
||||
spin_lock_bh(&card->ctrl[i].queue_lock);
|
||||
card->ctrl[i].stats.sw_q_depth += dma_cnt[i];
|
||||
list_splice_tail(&dma_list[i], &card->ctrl[i].queue);
|
||||
spin_unlock(&card->ctrl[i].queue_lock);
|
||||
spin_unlock_bh(&card->ctrl[i].queue_lock);
|
||||
|
||||
queue_work(card->ctrl[i].issue_wq,
|
||||
&card->ctrl[i].issue_dma_work);
|
||||
@ -711,8 +736,11 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
|
||||
return 0;
|
||||
|
||||
bvec_err:
|
||||
for (i = 0; i < card->n_targets; i++)
|
||||
rsxx_cleanup_dma_queue(card, &dma_list[i]);
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
spin_lock_bh(&card->ctrl[i].queue_lock);
|
||||
rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]);
|
||||
spin_unlock_bh(&card->ctrl[i].queue_lock);
|
||||
}
|
||||
|
||||
return st;
|
||||
}
|
||||
@ -780,6 +808,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
|
||||
spin_lock_init(&ctrl->trackers->lock);
|
||||
|
||||
spin_lock_init(&ctrl->queue_lock);
|
||||
mutex_init(&ctrl->work_lock);
|
||||
INIT_LIST_HEAD(&ctrl->queue);
|
||||
|
||||
setup_timer(&ctrl->activity_timer, dma_engine_stalled,
|
||||
@ -793,8 +822,8 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
|
||||
if (!ctrl->done_wq)
|
||||
return -ENOMEM;
|
||||
|
||||
INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas);
|
||||
INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done);
|
||||
INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue);
|
||||
INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done);
|
||||
|
||||
st = rsxx_hw_buffers_init(dev, ctrl);
|
||||
if (st)
|
||||
@ -918,13 +947,30 @@ failed_dma_setup:
|
||||
return st;
|
||||
}
|
||||
|
||||
int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl)
|
||||
{
|
||||
struct rsxx_dma *dma;
|
||||
int i;
|
||||
int cnt = 0;
|
||||
|
||||
/* Clean up issued DMAs */
|
||||
for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) {
|
||||
dma = get_tracker_dma(ctrl->trackers, i);
|
||||
if (dma) {
|
||||
atomic_dec(&ctrl->stats.hw_q_depth);
|
||||
rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
|
||||
push_tracker(ctrl->trackers, i);
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
void rsxx_dma_destroy(struct rsxx_cardinfo *card)
|
||||
{
|
||||
struct rsxx_dma_ctrl *ctrl;
|
||||
struct rsxx_dma *dma;
|
||||
int i, j;
|
||||
int cnt = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
ctrl = &card->ctrl[i];
|
||||
@ -943,33 +989,11 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card)
|
||||
del_timer_sync(&ctrl->activity_timer);
|
||||
|
||||
/* Clean up the DMA queue */
|
||||
spin_lock(&ctrl->queue_lock);
|
||||
cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue);
|
||||
spin_unlock(&ctrl->queue_lock);
|
||||
spin_lock_bh(&ctrl->queue_lock);
|
||||
rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
|
||||
spin_unlock_bh(&ctrl->queue_lock);
|
||||
|
||||
if (cnt)
|
||||
dev_info(CARD_TO_DEV(card),
|
||||
"Freed %d queued DMAs on channel %d\n",
|
||||
cnt, i);
|
||||
|
||||
/* Clean up issued DMAs */
|
||||
for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) {
|
||||
dma = get_tracker_dma(ctrl->trackers, j);
|
||||
if (dma) {
|
||||
pci_unmap_page(card->dev, dma->dma_addr,
|
||||
get_dma_size(dma),
|
||||
(dma->cmd == HW_CMD_BLK_WRITE) ?
|
||||
PCI_DMA_TODEVICE :
|
||||
PCI_DMA_FROMDEVICE);
|
||||
kmem_cache_free(rsxx_dma_pool, dma);
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
|
||||
if (cnt)
|
||||
dev_info(CARD_TO_DEV(card),
|
||||
"Freed %d pending DMAs on channel %d\n",
|
||||
cnt, i);
|
||||
rsxx_dma_cancel(ctrl);
|
||||
|
||||
vfree(ctrl->trackers);
|
||||
|
||||
@ -1013,7 +1037,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
|
||||
cnt++;
|
||||
}
|
||||
|
||||
spin_lock(&card->ctrl[i].queue_lock);
|
||||
spin_lock_bh(&card->ctrl[i].queue_lock);
|
||||
list_splice(&issued_dmas[i], &card->ctrl[i].queue);
|
||||
|
||||
atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
|
||||
@ -1028,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
|
||||
PCI_DMA_TODEVICE :
|
||||
PCI_DMA_FROMDEVICE);
|
||||
}
|
||||
spin_unlock(&card->ctrl[i].queue_lock);
|
||||
spin_unlock_bh(&card->ctrl[i].queue_lock);
|
||||
}
|
||||
|
||||
kfree(issued_dmas);
|
||||
@ -1036,30 +1060,13 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
|
||||
return 0;
|
||||
}
|
||||
|
||||
void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card)
|
||||
{
|
||||
struct rsxx_dma *dma;
|
||||
struct rsxx_dma *tmp;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
spin_lock(&card->ctrl[i].queue_lock);
|
||||
list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) {
|
||||
list_del(&dma->list);
|
||||
|
||||
rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED);
|
||||
}
|
||||
spin_unlock(&card->ctrl[i].queue_lock);
|
||||
}
|
||||
}
|
||||
|
||||
int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
|
||||
{
|
||||
struct rsxx_dma *dma;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
spin_lock(&card->ctrl[i].queue_lock);
|
||||
spin_lock_bh(&card->ctrl[i].queue_lock);
|
||||
list_for_each_entry(dma, &card->ctrl[i].queue, list) {
|
||||
dma->dma_addr = pci_map_page(card->dev, dma->page,
|
||||
dma->pg_off, get_dma_size(dma),
|
||||
@ -1067,12 +1074,12 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
|
||||
PCI_DMA_TODEVICE :
|
||||
PCI_DMA_FROMDEVICE);
|
||||
if (!dma->dma_addr) {
|
||||
spin_unlock(&card->ctrl[i].queue_lock);
|
||||
spin_unlock_bh(&card->ctrl[i].queue_lock);
|
||||
kmem_cache_free(rsxx_dma_pool, dma);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
spin_unlock(&card->ctrl[i].queue_lock);
|
||||
spin_unlock_bh(&card->ctrl[i].queue_lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -39,6 +39,7 @@
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/ioctl.h>
|
||||
#include <linux/delay.h>
|
||||
|
||||
#include "rsxx.h"
|
||||
#include "rsxx_cfg.h"
|
||||
@ -114,6 +115,7 @@ struct rsxx_dma_ctrl {
|
||||
struct timer_list activity_timer;
|
||||
struct dma_tracker_list *trackers;
|
||||
struct rsxx_dma_stats stats;
|
||||
struct mutex work_lock;
|
||||
};
|
||||
|
||||
struct rsxx_cardinfo {
|
||||
@ -134,6 +136,7 @@ struct rsxx_cardinfo {
|
||||
spinlock_t lock;
|
||||
bool active;
|
||||
struct creg_cmd *active_cmd;
|
||||
struct workqueue_struct *creg_wq;
|
||||
struct work_struct done_work;
|
||||
struct list_head queue;
|
||||
unsigned int q_depth;
|
||||
@ -154,6 +157,7 @@ struct rsxx_cardinfo {
|
||||
int buf_len;
|
||||
} log;
|
||||
|
||||
struct workqueue_struct *event_wq;
|
||||
struct work_struct event_work;
|
||||
unsigned int state;
|
||||
u64 size8;
|
||||
@ -181,6 +185,8 @@ struct rsxx_cardinfo {
|
||||
|
||||
int n_targets;
|
||||
struct rsxx_dma_ctrl *ctrl;
|
||||
|
||||
struct dentry *debugfs_dir;
|
||||
};
|
||||
|
||||
enum rsxx_pci_regmap {
|
||||
@ -283,6 +289,7 @@ enum rsxx_creg_addr {
|
||||
CREG_ADD_CAPABILITIES = 0x80001050,
|
||||
CREG_ADD_LOG = 0x80002000,
|
||||
CREG_ADD_NUM_TARGETS = 0x80003000,
|
||||
CREG_ADD_CRAM = 0xA0000000,
|
||||
CREG_ADD_CONFIG = 0xB0000000,
|
||||
};
|
||||
|
||||
@ -372,6 +379,8 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
|
||||
int rsxx_dma_setup(struct rsxx_cardinfo *card);
|
||||
void rsxx_dma_destroy(struct rsxx_cardinfo *card);
|
||||
int rsxx_dma_init(void);
|
||||
int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q);
|
||||
int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
|
||||
void rsxx_dma_cleanup(void);
|
||||
void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
|
||||
int rsxx_dma_configure(struct rsxx_cardinfo *card);
|
||||
@ -382,7 +391,6 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
|
||||
void *cb_data);
|
||||
int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl);
|
||||
int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card);
|
||||
void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card);
|
||||
int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card);
|
||||
|
||||
/***** cregs.c *****/
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -50,6 +50,19 @@
|
||||
__func__, __LINE__, ##args)
|
||||
|
||||
|
||||
/*
 * This is the maximum number of segments that would be allowed in indirect
 * requests. This value will also be passed to the frontend.
 */
#define MAX_INDIRECT_SEGMENTS 256

/* Number of segment descriptors that fit in one indirect page (frame). */
#define SEGS_PER_INDIRECT_FRAME \
	(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
/* Indirect pages needed to describe MAX_INDIRECT_SEGMENTS segments. */
#define MAX_INDIRECT_PAGES \
	((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
/*
 * Indirect pages needed to describe _segs segments, rounded up.
 * The argument is parenthesized so that compound expressions (e.g. a
 * conditional expression) expand with the intended precedence.
 */
#define INDIRECT_PAGES(_segs) \
	(((_segs) + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
|
||||
|
||||
/* Not a real protocol. Used to generate ring structs which contain
|
||||
* the elements common to all protocols only. This way we get a
|
||||
* compiler-checkable way to use common struct elements, so we can
|
||||
@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
|
||||
uint64_t id; /* private guest value, echoed in resp */
|
||||
} __attribute__((__packed__));
|
||||
|
||||
struct blkif_x86_32_request_indirect {
|
||||
uint8_t indirect_op;
|
||||
uint16_t nr_segments;
|
||||
uint64_t id;
|
||||
blkif_sector_t sector_number;
|
||||
blkif_vdev_t handle;
|
||||
uint16_t _pad1;
|
||||
grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
|
||||
/*
|
||||
* The maximum number of indirect segments (and pages) that will
|
||||
* be used is determined by MAX_INDIRECT_SEGMENTS, this value
|
||||
* is also exported to the guest (via xenstore
|
||||
* feature-max-indirect-segments entry), so the frontend knows how
|
||||
* many indirect segments the backend supports.
|
||||
*/
|
||||
uint64_t _pad2; /* make it 64 byte aligned */
|
||||
} __attribute__((__packed__));
|
||||
|
||||
struct blkif_x86_32_request {
|
||||
uint8_t operation; /* BLKIF_OP_??? */
|
||||
union {
|
||||
struct blkif_x86_32_request_rw rw;
|
||||
struct blkif_x86_32_request_discard discard;
|
||||
struct blkif_x86_32_request_other other;
|
||||
struct blkif_x86_32_request_indirect indirect;
|
||||
} u;
|
||||
} __attribute__((__packed__));
|
||||
|
||||
@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
|
||||
uint64_t id; /* private guest value, echoed in resp */
|
||||
} __attribute__((__packed__));
|
||||
|
||||
struct blkif_x86_64_request_indirect {
|
||||
uint8_t indirect_op;
|
||||
uint16_t nr_segments;
|
||||
uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
|
||||
uint64_t id;
|
||||
blkif_sector_t sector_number;
|
||||
blkif_vdev_t handle;
|
||||
uint16_t _pad2;
|
||||
grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
|
||||
/*
|
||||
* The maximum number of indirect segments (and pages) that will
|
||||
* be used is determined by MAX_INDIRECT_SEGMENTS, this value
|
||||
* is also exported to the guest (via xenstore
|
||||
* feature-max-indirect-segments entry), so the frontend knows how
|
||||
* many indirect segments the backend supports.
|
||||
*/
|
||||
uint32_t _pad3; /* make it 64 byte aligned */
|
||||
} __attribute__((__packed__));
|
||||
|
||||
struct blkif_x86_64_request {
|
||||
uint8_t operation; /* BLKIF_OP_??? */
|
||||
union {
|
||||
struct blkif_x86_64_request_rw rw;
|
||||
struct blkif_x86_64_request_discard discard;
|
||||
struct blkif_x86_64_request_other other;
|
||||
struct blkif_x86_64_request_indirect indirect;
|
||||
} u;
|
||||
} __attribute__((__packed__));
|
||||
|
||||
@ -182,12 +234,26 @@ struct xen_vbd {
|
||||
|
||||
struct backend_info;
|
||||
|
||||
/* Number of available flags */
|
||||
#define PERSISTENT_GNT_FLAGS_SIZE 2
|
||||
/* This persistent grant is currently in use */
|
||||
#define PERSISTENT_GNT_ACTIVE 0
|
||||
/*
|
||||
* This persistent grant has been used, this flag is set when we remove the
|
||||
* PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
|
||||
*/
|
||||
#define PERSISTENT_GNT_WAS_ACTIVE 1
|
||||
|
||||
/* Number of requests that we can fit in a ring */
|
||||
#define XEN_BLKIF_REQS 32
|
||||
|
||||
/*
 * Bookkeeping record for one persistent grant: the page, its grant
 * reference and handle, a small flag bitmap, and the linkage used to
 * track it.
 */
struct persistent_gnt {
|
||||
struct page *page;
|
||||
grant_ref_t gnt;
|
||||
grant_handle_t handle;
|
||||
/*
 * Bitmap of PERSISTENT_GNT_FLAGS_SIZE bits; the bit numbers are
 * PERSISTENT_GNT_ACTIVE and PERSISTENT_GNT_WAS_ACTIVE.
 */
DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
|
||||
/* rb-tree linkage; struct xen_blkif holds an rb_root named persistent_gnts. */
struct rb_node node;
|
||||
/* NOTE(review): presumably links the grant onto a purge/removal list - confirm. */
struct list_head remove_node;
|
||||
};
|
||||
|
||||
struct xen_blkif {
|
||||
@ -219,6 +285,23 @@ struct xen_blkif {
|
||||
/* tree to store persistent grants */
|
||||
struct rb_root persistent_gnts;
|
||||
unsigned int persistent_gnt_c;
|
||||
atomic_t persistent_gnt_in_use;
|
||||
unsigned long next_lru;
|
||||
|
||||
/* used by the kworker that offloads work from the persistent purge */
|
||||
struct list_head persistent_purge_list;
|
||||
struct work_struct persistent_purge_work;
|
||||
|
||||
/* buffer of free pages to map grant refs */
|
||||
spinlock_t free_pages_lock;
|
||||
int free_pages_num;
|
||||
struct list_head free_pages;
|
||||
|
||||
/* List of all 'pending_req' available */
|
||||
struct list_head pending_free;
|
||||
/* And its spinlock. */
|
||||
spinlock_t pending_free_lock;
|
||||
wait_queue_head_t pending_free_wq;
|
||||
|
||||
/* statistics */
|
||||
unsigned long st_print;
|
||||
@ -231,6 +314,41 @@ struct xen_blkif {
|
||||
unsigned long long st_wr_sect;
|
||||
|
||||
wait_queue_head_t waiting_to_free;
|
||||
/* Thread shutdown wait queue. */
|
||||
wait_queue_head_t shutdown_wq;
|
||||
};
|
||||
|
||||
struct seg_buf {
|
||||
unsigned long offset;
|
||||
unsigned int nsec;
|
||||
};
|
||||
|
||||
/*
 * Per-page tracking entry. struct pending_req keeps arrays of pointers to
 * these, one array for data segments and one for indirect pages.
 */
struct grant_page {
|
||||
struct page *page;
|
||||
/* NOTE(review): presumably NULL when the page is not backed by a persistent grant - confirm. */
struct persistent_gnt *persistent_gnt;
|
||||
grant_handle_t handle;
|
||||
grant_ref_t gref;
|
||||
};
|
||||
|
||||
/*
|
||||
* Each outstanding request that we've passed to the lower device layers has a
|
||||
* 'pending_req' allocated to it. Each buffer_head that completes decrements
|
||||
* the pendcnt towards zero. When it hits zero, the specified domain has a
|
||||
* response queued for it, with the saved 'id' passed back.
|
||||
*/
|
||||
struct pending_req {
|
||||
struct xen_blkif *blkif;
|
||||
u64 id;
|
||||
int nr_pages;
|
||||
atomic_t pendcnt;
|
||||
unsigned short operation;
|
||||
int status;
|
||||
struct list_head free_list;
|
||||
struct grant_page *segments[MAX_INDIRECT_SEGMENTS];
|
||||
/* Indirect descriptors */
|
||||
struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
|
||||
struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
|
||||
struct bio *biolist[MAX_INDIRECT_SEGMENTS];
|
||||
};
|
||||
|
||||
|
||||
@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void);
|
||||
|
||||
irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
|
||||
int xen_blkif_schedule(void *arg);
|
||||
int xen_blkif_purge_persistent(void *arg);
|
||||
|
||||
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
|
||||
struct backend_info *be, int state);
|
||||
@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
|
||||
static inline void blkif_get_x86_32_req(struct blkif_request *dst,
|
||||
struct blkif_x86_32_request *src)
|
||||
{
|
||||
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
|
||||
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
|
||||
dst->operation = src->operation;
|
||||
switch (src->operation) {
|
||||
case BLKIF_OP_READ:
|
||||
@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
|
||||
dst->u.discard.sector_number = src->u.discard.sector_number;
|
||||
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
|
||||
break;
|
||||
case BLKIF_OP_INDIRECT:
|
||||
dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
|
||||
dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
|
||||
dst->u.indirect.handle = src->u.indirect.handle;
|
||||
dst->u.indirect.id = src->u.indirect.id;
|
||||
dst->u.indirect.sector_number = src->u.indirect.sector_number;
|
||||
barrier();
|
||||
j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
|
||||
for (i = 0; i < j; i++)
|
||||
dst->u.indirect.indirect_grefs[i] =
|
||||
src->u.indirect.indirect_grefs[i];
|
||||
break;
|
||||
default:
|
||||
/*
|
||||
* Don't know how to translate this op. Only get the
|
||||
@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
|
||||
static inline void blkif_get_x86_64_req(struct blkif_request *dst,
|
||||
struct blkif_x86_64_request *src)
|
||||
{
|
||||
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
|
||||
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
|
||||
dst->operation = src->operation;
|
||||
switch (src->operation) {
|
||||
case BLKIF_OP_READ:
|
||||
@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
|
||||
dst->u.discard.sector_number = src->u.discard.sector_number;
|
||||
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
|
||||
break;
|
||||
case BLKIF_OP_INDIRECT:
|
||||
dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
|
||||
dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
|
||||
dst->u.indirect.handle = src->u.indirect.handle;
|
||||
dst->u.indirect.id = src->u.indirect.id;
|
||||
dst->u.indirect.sector_number = src->u.indirect.sector_number;
|
||||
barrier();
|
||||
j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
|
||||
for (i = 0; i < j; i++)
|
||||
dst->u.indirect.indirect_grefs[i] =
|
||||
src->u.indirect.indirect_grefs[i];
|
||||
break;
|
||||
default:
|
||||
/*
|
||||
* Don't know how to translate this op. Only get the
|
||||
|
@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
|
||||
err = PTR_ERR(blkif->xenblkd);
|
||||
blkif->xenblkd = NULL;
|
||||
xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static struct xen_blkif *xen_blkif_alloc(domid_t domid)
|
||||
{
|
||||
struct xen_blkif *blkif;
|
||||
struct pending_req *req, *n;
|
||||
int i, j;
|
||||
|
||||
BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
|
||||
|
||||
blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
|
||||
if (!blkif)
|
||||
@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
|
||||
blkif->st_print = jiffies;
|
||||
init_waitqueue_head(&blkif->waiting_to_free);
|
||||
blkif->persistent_gnts.rb_node = NULL;
|
||||
spin_lock_init(&blkif->free_pages_lock);
|
||||
INIT_LIST_HEAD(&blkif->free_pages);
|
||||
blkif->free_pages_num = 0;
|
||||
atomic_set(&blkif->persistent_gnt_in_use, 0);
|
||||
|
||||
INIT_LIST_HEAD(&blkif->pending_free);
|
||||
|
||||
for (i = 0; i < XEN_BLKIF_REQS; i++) {
|
||||
req = kzalloc(sizeof(*req), GFP_KERNEL);
|
||||
if (!req)
|
||||
goto fail;
|
||||
list_add_tail(&req->free_list,
|
||||
&blkif->pending_free);
|
||||
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
|
||||
req->segments[j] = kzalloc(sizeof(*req->segments[0]),
|
||||
GFP_KERNEL);
|
||||
if (!req->segments[j])
|
||||
goto fail;
|
||||
}
|
||||
for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
|
||||
req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
|
||||
GFP_KERNEL);
|
||||
if (!req->indirect_pages[j])
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
spin_lock_init(&blkif->pending_free_lock);
|
||||
init_waitqueue_head(&blkif->pending_free_wq);
|
||||
init_waitqueue_head(&blkif->shutdown_wq);
|
||||
|
||||
return blkif;
|
||||
|
||||
fail:
|
||||
list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
|
||||
list_del(&req->free_list);
|
||||
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
|
||||
if (!req->segments[j])
|
||||
break;
|
||||
kfree(req->segments[j]);
|
||||
}
|
||||
for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
|
||||
if (!req->indirect_pages[j])
|
||||
break;
|
||||
kfree(req->indirect_pages[j]);
|
||||
}
|
||||
kfree(req);
|
||||
}
|
||||
|
||||
kmem_cache_free(xen_blkif_cachep, blkif);
|
||||
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
|
||||
@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
|
||||
{
|
||||
if (blkif->xenblkd) {
|
||||
kthread_stop(blkif->xenblkd);
|
||||
wake_up(&blkif->shutdown_wq);
|
||||
blkif->xenblkd = NULL;
|
||||
}
|
||||
|
||||
@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
|
||||
|
||||
static void xen_blkif_free(struct xen_blkif *blkif)
|
||||
{
|
||||
struct pending_req *req, *n;
|
||||
int i = 0, j;
|
||||
|
||||
if (!atomic_dec_and_test(&blkif->refcnt))
|
||||
BUG();
|
||||
|
||||
/* Check that there is no request in use */
|
||||
list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
|
||||
list_del(&req->free_list);
|
||||
|
||||
for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
|
||||
kfree(req->segments[j]);
|
||||
|
||||
for (j = 0; j < MAX_INDIRECT_PAGES; j++)
|
||||
kfree(req->indirect_pages[j]);
|
||||
|
||||
kfree(req);
|
||||
i++;
|
||||
}
|
||||
|
||||
WARN_ON(i != XEN_BLKIF_REQS);
|
||||
|
||||
kmem_cache_free(xen_blkif_cachep, blkif);
|
||||
}
|
||||
|
||||
@ -678,6 +753,11 @@ again:
|
||||
dev->nodename);
|
||||
goto abort;
|
||||
}
|
||||
err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
|
||||
MAX_INDIRECT_SEGMENTS);
|
||||
if (err)
|
||||
dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
|
||||
dev->nodename, err);
|
||||
|
||||
err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
|
||||
(unsigned long long)vbd_sz(&be->blkif->vbd));
|
||||
@ -704,6 +784,11 @@ again:
|
||||
dev->nodename);
|
||||
goto abort;
|
||||
}
|
||||
err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
|
||||
bdev_physical_block_size(be->blkif->vbd.bdev));
|
||||
if (err)
|
||||
xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
|
||||
dev->nodename);
|
||||
|
||||
err = xenbus_transaction_end(xbt, 0);
|
||||
if (err == -EAGAIN)
|
||||
|
@ -74,12 +74,30 @@ struct grant {
|
||||
struct blk_shadow {
|
||||
struct blkif_request req;
|
||||
struct request *request;
|
||||
struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
|
||||
struct grant **grants_used;
|
||||
struct grant **indirect_grants;
|
||||
struct scatterlist *sg;
|
||||
};
|
||||
|
||||
/*
 * Completion state shared by the pieces of a bio that has been split;
 * consumed by split_bio_end().
 */
struct split_bio {
|
||||
/* The bio that split_bio_end() completes once 'pending' drops to zero. */
struct bio *bio;
|
||||
/* Outstanding pieces; decremented by each split_bio_end() call. */
atomic_t pending;
|
||||
/* Most recent non-zero error reported by a piece; passed to bio_endio(). */
int err;
|
||||
};
|
||||
|
||||
static DEFINE_MUTEX(blkfront_mutex);
|
||||
static const struct block_device_operations xlvbd_block_fops;
|
||||
|
||||
/*
|
||||
* Maximum number of segments in indirect requests, the actual value used by
|
||||
* the frontend driver is the minimum of this value and the value provided
|
||||
* by the backend driver.
|
||||
*/
|
||||
|
||||
static unsigned int xen_blkif_max_segments = 32;
|
||||
module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
|
||||
MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
|
||||
|
||||
#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
|
||||
|
||||
/*
|
||||
@ -98,7 +116,6 @@ struct blkfront_info
|
||||
enum blkif_state connected;
|
||||
int ring_ref;
|
||||
struct blkif_front_ring ring;
|
||||
struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
|
||||
unsigned int evtchn, irq;
|
||||
struct request_queue *rq;
|
||||
struct work_struct work;
|
||||
@ -114,6 +131,7 @@ struct blkfront_info
|
||||
unsigned int discard_granularity;
|
||||
unsigned int discard_alignment;
|
||||
unsigned int feature_persistent:1;
|
||||
unsigned int max_indirect_segments;
|
||||
int is_ready;
|
||||
};
|
||||
|
||||
@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock);
|
||||
|
||||
#define DEV_NAME "xvd" /* name in /dev */
|
||||
|
||||
/* Number of segment descriptors that fit in one indirect page (frame). */
#define SEGS_PER_INDIRECT_FRAME \
	(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
/*
 * Number of indirect pages (one grant reference each) needed to hold
 * _segs segment descriptors, rounded up. The argument is parenthesized so
 * that compound expressions expand with the intended precedence.
 */
#define INDIRECT_GREFS(_segs) \
	(((_segs) + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
|
||||
|
||||
static int blkfront_setup_indirect(struct blkfront_info *info);
|
||||
|
||||
static int get_id_from_freelist(struct blkfront_info *info)
|
||||
{
|
||||
unsigned long free = info->shadow_free;
|
||||
@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req)
|
||||
struct blkif_request *ring_req;
|
||||
unsigned long id;
|
||||
unsigned int fsect, lsect;
|
||||
int i, ref;
|
||||
int i, ref, n;
|
||||
struct blkif_request_segment_aligned *segments = NULL;
|
||||
|
||||
/*
|
||||
* Used to store if we are able to queue the request by just using
|
||||
@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req)
|
||||
grant_ref_t gref_head;
|
||||
struct grant *gnt_list_entry = NULL;
|
||||
struct scatterlist *sg;
|
||||
int nseg, max_grefs;
|
||||
|
||||
if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
|
||||
return 1;
|
||||
|
||||
/* Check if we have enough grants to allocate a request */
|
||||
if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
|
||||
max_grefs = info->max_indirect_segments ?
|
||||
info->max_indirect_segments +
|
||||
INDIRECT_GREFS(info->max_indirect_segments) :
|
||||
BLKIF_MAX_SEGMENTS_PER_REQUEST;
|
||||
|
||||
/* Check if we have enough grants to allocate a request */
|
||||
if (info->persistent_gnts_c < max_grefs) {
|
||||
new_persistent_gnts = 1;
|
||||
if (gnttab_alloc_grant_references(
|
||||
BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
|
||||
max_grefs - info->persistent_gnts_c,
|
||||
&gref_head) < 0) {
|
||||
gnttab_request_free_callback(
|
||||
&info->callback,
|
||||
blkif_restart_queue_callback,
|
||||
info,
|
||||
BLKIF_MAX_SEGMENTS_PER_REQUEST);
|
||||
max_grefs);
|
||||
return 1;
|
||||
}
|
||||
} else
|
||||
@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req)
|
||||
id = get_id_from_freelist(info);
|
||||
info->shadow[id].request = req;
|
||||
|
||||
ring_req->u.rw.id = id;
|
||||
ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
|
||||
ring_req->u.rw.handle = info->handle;
|
||||
|
||||
ring_req->operation = rq_data_dir(req) ?
|
||||
BLKIF_OP_WRITE : BLKIF_OP_READ;
|
||||
|
||||
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
|
||||
/*
|
||||
* Ideally we can do an unordered flush-to-disk. In case the
|
||||
* backend only supports barriers, use that. A barrier request is
|
||||
* a superset of FUA, so we can implement it the same
|
||||
* way. (It's also a FLUSH+FUA, since it is
|
||||
* guaranteed ordered WRT previous writes.)
|
||||
*/
|
||||
ring_req->operation = info->flush_op;
|
||||
}
|
||||
|
||||
if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
|
||||
/* id, sector_number and handle are set above. */
|
||||
ring_req->operation = BLKIF_OP_DISCARD;
|
||||
ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
|
||||
ring_req->u.discard.id = id;
|
||||
ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
|
||||
if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
|
||||
ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
|
||||
else
|
||||
ring_req->u.discard.flag = 0;
|
||||
} else {
|
||||
ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
|
||||
info->sg);
|
||||
BUG_ON(ring_req->u.rw.nr_segments >
|
||||
BLKIF_MAX_SEGMENTS_PER_REQUEST);
|
||||
|
||||
for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
|
||||
BUG_ON(info->max_indirect_segments == 0 &&
|
||||
req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
|
||||
BUG_ON(info->max_indirect_segments &&
|
||||
req->nr_phys_segments > info->max_indirect_segments);
|
||||
nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
|
||||
ring_req->u.rw.id = id;
|
||||
if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
|
||||
/*
|
||||
* The indirect operation can only be a BLKIF_OP_READ or
|
||||
* BLKIF_OP_WRITE
|
||||
*/
|
||||
BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
|
||||
ring_req->operation = BLKIF_OP_INDIRECT;
|
||||
ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
|
||||
BLKIF_OP_WRITE : BLKIF_OP_READ;
|
||||
ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
|
||||
ring_req->u.indirect.handle = info->handle;
|
||||
ring_req->u.indirect.nr_segments = nseg;
|
||||
} else {
|
||||
ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
|
||||
ring_req->u.rw.handle = info->handle;
|
||||
ring_req->operation = rq_data_dir(req) ?
|
||||
BLKIF_OP_WRITE : BLKIF_OP_READ;
|
||||
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
|
||||
/*
|
||||
* Ideally we can do an unordered flush-to-disk. In case the
|
||||
* backend only supports barriers, use that. A barrier request is
|
||||
* a superset of FUA, so we can implement it the same
|
||||
* way. (It's also a FLUSH+FUA, since it is
|
||||
* guaranteed ordered WRT previous writes.)
|
||||
*/
|
||||
ring_req->operation = info->flush_op;
|
||||
}
|
||||
ring_req->u.rw.nr_segments = nseg;
|
||||
}
|
||||
for_each_sg(info->shadow[id].sg, sg, nseg, i) {
|
||||
fsect = sg->offset >> 9;
|
||||
lsect = fsect + (sg->length >> 9) - 1;
|
||||
|
||||
if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
|
||||
(i % SEGS_PER_INDIRECT_FRAME == 0)) {
|
||||
if (segments)
|
||||
kunmap_atomic(segments);
|
||||
|
||||
n = i / SEGS_PER_INDIRECT_FRAME;
|
||||
gnt_list_entry = get_grant(&gref_head, info);
|
||||
info->shadow[id].indirect_grants[n] = gnt_list_entry;
|
||||
segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
|
||||
ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
|
||||
}
|
||||
|
||||
gnt_list_entry = get_grant(&gref_head, info);
|
||||
ref = gnt_list_entry->gref;
|
||||
|
||||
@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req)
|
||||
|
||||
BUG_ON(sg->offset + sg->length > PAGE_SIZE);
|
||||
|
||||
shared_data = kmap_atomic(
|
||||
pfn_to_page(gnt_list_entry->pfn));
|
||||
shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
|
||||
bvec_data = kmap_atomic(sg_page(sg));
|
||||
|
||||
/*
|
||||
@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req)
|
||||
kunmap_atomic(bvec_data);
|
||||
kunmap_atomic(shared_data);
|
||||
}
|
||||
|
||||
ring_req->u.rw.seg[i] =
|
||||
(struct blkif_request_segment) {
|
||||
.gref = ref,
|
||||
.first_sect = fsect,
|
||||
.last_sect = lsect };
|
||||
if (ring_req->operation != BLKIF_OP_INDIRECT) {
|
||||
ring_req->u.rw.seg[i] =
|
||||
(struct blkif_request_segment) {
|
||||
.gref = ref,
|
||||
.first_sect = fsect,
|
||||
.last_sect = lsect };
|
||||
} else {
|
||||
n = i % SEGS_PER_INDIRECT_FRAME;
|
||||
segments[n] =
|
||||
(struct blkif_request_segment_aligned) {
|
||||
.gref = ref,
|
||||
.first_sect = fsect,
|
||||
.last_sect = lsect };
|
||||
}
|
||||
}
|
||||
if (segments)
|
||||
kunmap_atomic(segments);
|
||||
}
|
||||
|
||||
info->ring.req_prod_pvt++;
|
||||
@ -542,7 +608,9 @@ wait:
|
||||
flush_requests(info);
|
||||
}
|
||||
|
||||
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
|
||||
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
|
||||
unsigned int physical_sector_size,
|
||||
unsigned int segments)
|
||||
{
|
||||
struct request_queue *rq;
|
||||
struct blkfront_info *info = gd->private_data;
|
||||
@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
|
||||
|
||||
/* Hard sector size and max sectors impersonate the equiv. hardware. */
|
||||
blk_queue_logical_block_size(rq, sector_size);
|
||||
blk_queue_max_hw_sectors(rq, 512);
|
||||
blk_queue_physical_block_size(rq, physical_sector_size);
|
||||
blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
|
||||
|
||||
/* Each segment in a request is up to an aligned page in size. */
|
||||
blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
|
||||
blk_queue_max_segment_size(rq, PAGE_SIZE);
|
||||
|
||||
/* Ensure a merged request will fit in a single I/O ring slot. */
|
||||
blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
|
||||
blk_queue_max_segments(rq, segments);
|
||||
|
||||
/* Make sure buffer addresses are sector-aligned. */
|
||||
blk_queue_dma_alignment(rq, 511);
|
||||
@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
|
||||
static void xlvbd_flush(struct blkfront_info *info)
|
||||
{
|
||||
blk_queue_flush(info->rq, info->feature_flush);
|
||||
printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
|
||||
printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
|
||||
info->gd->disk_name,
|
||||
info->flush_op == BLKIF_OP_WRITE_BARRIER ?
|
||||
"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
|
||||
"flush diskcache" : "barrier or flush"),
|
||||
info->feature_flush ? "enabled" : "disabled",
|
||||
info->feature_persistent ? "using persistent grants" : "");
|
||||
info->feature_flush ? "enabled;" : "disabled;",
|
||||
"persistent grants:",
|
||||
info->feature_persistent ? "enabled;" : "disabled;",
|
||||
"indirect descriptors:",
|
||||
info->max_indirect_segments ? "enabled;" : "disabled;");
|
||||
}
|
||||
|
||||
static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
|
||||
@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n)
|
||||
|
||||
static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
|
||||
struct blkfront_info *info,
|
||||
u16 vdisk_info, u16 sector_size)
|
||||
u16 vdisk_info, u16 sector_size,
|
||||
unsigned int physical_sector_size)
|
||||
{
|
||||
struct gendisk *gd;
|
||||
int nr_minors = 1;
|
||||
@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
|
||||
gd->driverfs_dev = &(info->xbdev->dev);
|
||||
set_capacity(gd, capacity);
|
||||
|
||||
if (xlvbd_init_blk_queue(gd, sector_size)) {
|
||||
if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
|
||||
info->max_indirect_segments ? :
|
||||
BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
|
||||
del_gendisk(gd);
|
||||
goto release;
|
||||
}
|
||||
@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
|
||||
{
|
||||
struct grant *persistent_gnt;
|
||||
struct grant *n;
|
||||
int i, j, segs;
|
||||
|
||||
/* Prevent new requests being issued until we fix things up. */
|
||||
spin_lock_irq(&info->io_lock);
|
||||
@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
|
||||
}
|
||||
BUG_ON(info->persistent_gnts_c != 0);
|
||||
|
||||
for (i = 0; i < BLK_RING_SIZE; i++) {
|
||||
/*
|
||||
* Clear persistent grants present in requests already
|
||||
* on the shared ring
|
||||
*/
|
||||
if (!info->shadow[i].request)
|
||||
goto free_shadow;
|
||||
|
||||
segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
|
||||
info->shadow[i].req.u.indirect.nr_segments :
|
||||
info->shadow[i].req.u.rw.nr_segments;
|
||||
for (j = 0; j < segs; j++) {
|
||||
persistent_gnt = info->shadow[i].grants_used[j];
|
||||
gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
|
||||
__free_page(pfn_to_page(persistent_gnt->pfn));
|
||||
kfree(persistent_gnt);
|
||||
}
|
||||
|
||||
if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
|
||||
/*
|
||||
* If this is not an indirect operation don't try to
|
||||
* free indirect segments
|
||||
*/
|
||||
goto free_shadow;
|
||||
|
||||
for (j = 0; j < INDIRECT_GREFS(segs); j++) {
|
||||
persistent_gnt = info->shadow[i].indirect_grants[j];
|
||||
gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
|
||||
__free_page(pfn_to_page(persistent_gnt->pfn));
|
||||
kfree(persistent_gnt);
|
||||
}
|
||||
|
||||
free_shadow:
|
||||
kfree(info->shadow[i].grants_used);
|
||||
info->shadow[i].grants_used = NULL;
|
||||
kfree(info->shadow[i].indirect_grants);
|
||||
info->shadow[i].indirect_grants = NULL;
|
||||
kfree(info->shadow[i].sg);
|
||||
info->shadow[i].sg = NULL;
|
||||
}
|
||||
|
||||
/* No more gnttab callback work. */
|
||||
gnttab_cancel_free_callback(&info->callback);
|
||||
spin_unlock_irq(&info->io_lock);
|
||||
@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
|
||||
struct blkif_response *bret)
|
||||
{
|
||||
int i = 0;
|
||||
struct bio_vec *bvec;
|
||||
struct req_iterator iter;
|
||||
unsigned long flags;
|
||||
struct scatterlist *sg;
|
||||
char *bvec_data;
|
||||
void *shared_data;
|
||||
unsigned int offset = 0;
|
||||
int nseg;
|
||||
|
||||
nseg = s->req.operation == BLKIF_OP_INDIRECT ?
|
||||
s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
|
||||
|
||||
if (bret->operation == BLKIF_OP_READ) {
|
||||
/*
|
||||
@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
|
||||
* than PAGE_SIZE, we have to keep track of the current offset,
|
||||
* to be sure we are copying the data from the right shared page.
|
||||
*/
|
||||
rq_for_each_segment(bvec, s->request, iter) {
|
||||
BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
|
||||
if (bvec->bv_offset < offset)
|
||||
i++;
|
||||
BUG_ON(i >= s->req.u.rw.nr_segments);
|
||||
for_each_sg(s->sg, sg, nseg, i) {
|
||||
BUG_ON(sg->offset + sg->length > PAGE_SIZE);
|
||||
shared_data = kmap_atomic(
|
||||
pfn_to_page(s->grants_used[i]->pfn));
|
||||
bvec_data = bvec_kmap_irq(bvec, &flags);
|
||||
memcpy(bvec_data, shared_data + bvec->bv_offset,
|
||||
bvec->bv_len);
|
||||
bvec_kunmap_irq(bvec_data, &flags);
|
||||
bvec_data = kmap_atomic(sg_page(sg));
|
||||
memcpy(bvec_data + sg->offset,
|
||||
shared_data + sg->offset,
|
||||
sg->length);
|
||||
kunmap_atomic(bvec_data);
|
||||
kunmap_atomic(shared_data);
|
||||
offset = bvec->bv_offset + bvec->bv_len;
|
||||
}
|
||||
}
|
||||
/* Add the persistent grant into the list of free grants */
|
||||
for (i = 0; i < s->req.u.rw.nr_segments; i++) {
|
||||
for (i = 0; i < nseg; i++) {
|
||||
list_add(&s->grants_used[i]->node, &info->persistent_gnts);
|
||||
info->persistent_gnts_c++;
|
||||
}
|
||||
if (s->req.operation == BLKIF_OP_INDIRECT) {
|
||||
for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
|
||||
list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
|
||||
info->persistent_gnts_c++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static irqreturn_t blkif_interrupt(int irq, void *dev_id)
|
||||
@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev,
|
||||
SHARED_RING_INIT(sring);
|
||||
FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
|
||||
|
||||
sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
|
||||
|
||||
/* Allocate memory for grants */
|
||||
err = fill_grant_buffer(info, BLK_RING_SIZE *
|
||||
BLKIF_MAX_SEGMENTS_PER_REQUEST);
|
||||
if (err)
|
||||
goto fail;
|
||||
|
||||
err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
|
||||
if (err < 0) {
|
||||
free_page((unsigned long)sring);
|
||||
@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is a clone of md_trim_bio, used to split a bio into smaller ones
|
||||
*/
|
||||
static void trim_bio(struct bio *bio, int offset, int size)
|
||||
{
|
||||
/* 'bio' is a cloned bio which we need to trim to match
|
||||
* the given offset and size.
|
||||
* This requires adjusting bi_sector, bi_size, and bi_io_vec
|
||||
*/
|
||||
int i;
|
||||
struct bio_vec *bvec;
|
||||
int sofar = 0;
|
||||
|
||||
size <<= 9;
|
||||
if (offset == 0 && size == bio->bi_size)
|
||||
return;
|
||||
|
||||
bio->bi_sector += offset;
|
||||
bio->bi_size = size;
|
||||
offset <<= 9;
|
||||
clear_bit(BIO_SEG_VALID, &bio->bi_flags);
|
||||
|
||||
while (bio->bi_idx < bio->bi_vcnt &&
|
||||
bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
|
||||
/* remove this whole bio_vec */
|
||||
offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
|
||||
bio->bi_idx++;
|
||||
}
|
||||
if (bio->bi_idx < bio->bi_vcnt) {
|
||||
bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
|
||||
bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
|
||||
}
|
||||
/* avoid any complications with bi_idx being non-zero*/
|
||||
if (bio->bi_idx) {
|
||||
memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
|
||||
(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
|
||||
bio->bi_vcnt -= bio->bi_idx;
|
||||
bio->bi_idx = 0;
|
||||
}
|
||||
/* Make sure vcnt and last bv are not too big */
|
||||
bio_for_each_segment(bvec, bio, i) {
|
||||
if (sofar + bvec->bv_len > size)
|
||||
bvec->bv_len = size - sofar;
|
||||
if (bvec->bv_len == 0) {
|
||||
bio->bi_vcnt = i;
|
||||
break;
|
||||
}
|
||||
sofar += bvec->bv_len;
|
||||
}
|
||||
}
|
||||
|
||||
static void split_bio_end(struct bio *bio, int error)
|
||||
{
|
||||
struct split_bio *split_bio = bio->bi_private;
|
||||
|
||||
if (error)
|
||||
split_bio->err = error;
|
||||
|
||||
if (atomic_dec_and_test(&split_bio->pending)) {
|
||||
split_bio->bio->bi_phys_segments = 0;
|
||||
bio_endio(split_bio->bio, split_bio->err);
|
||||
kfree(split_bio);
|
||||
}
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
static int blkif_recover(struct blkfront_info *info)
|
||||
{
|
||||
int i;
|
||||
struct blkif_request *req;
|
||||
struct request *req, *n;
|
||||
struct blk_shadow *copy;
|
||||
int j;
|
||||
int rc;
|
||||
struct bio *bio, *cloned_bio;
|
||||
struct bio_list bio_list, merge_bio;
|
||||
unsigned int segs, offset;
|
||||
int pending, size;
|
||||
struct split_bio *split_bio;
|
||||
struct list_head requests;
|
||||
|
||||
/* Stage 1: Make a safe copy of the shadow state. */
|
||||
copy = kmemdup(info->shadow, sizeof(info->shadow),
|
||||
@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info)
|
||||
info->shadow_free = info->ring.req_prod_pvt;
|
||||
info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
|
||||
|
||||
/* Stage 3: Find pending requests and requeue them. */
|
||||
rc = blkfront_setup_indirect(info);
|
||||
if (rc) {
|
||||
kfree(copy);
|
||||
return rc;
|
||||
}
|
||||
|
||||
segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
|
||||
blk_queue_max_segments(info->rq, segs);
|
||||
bio_list_init(&bio_list);
|
||||
INIT_LIST_HEAD(&requests);
|
||||
for (i = 0; i < BLK_RING_SIZE; i++) {
|
||||
/* Not in use? */
|
||||
if (!copy[i].request)
|
||||
continue;
|
||||
|
||||
/* Grab a request slot and copy shadow state into it. */
|
||||
req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
|
||||
*req = copy[i].req;
|
||||
|
||||
/* We get a new request id, and must reset the shadow state. */
|
||||
req->u.rw.id = get_id_from_freelist(info);
|
||||
memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i]));
|
||||
|
||||
if (req->operation != BLKIF_OP_DISCARD) {
|
||||
/* Rewrite any grant references invalidated by susp/resume. */
|
||||
for (j = 0; j < req->u.rw.nr_segments; j++)
|
||||
gnttab_grant_foreign_access_ref(
|
||||
req->u.rw.seg[j].gref,
|
||||
info->xbdev->otherend_id,
|
||||
pfn_to_mfn(copy[i].grants_used[j]->pfn),
|
||||
0);
|
||||
/*
|
||||
* Get the bios in the request so we can re-queue them.
|
||||
*/
|
||||
if (copy[i].request->cmd_flags &
|
||||
(REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
|
||||
/*
|
||||
* Flush operations don't contain bios, so
|
||||
* we need to requeue the whole request
|
||||
*/
|
||||
list_add(©[i].request->queuelist, &requests);
|
||||
continue;
|
||||
}
|
||||
info->shadow[req->u.rw.id].req = *req;
|
||||
|
||||
info->ring.req_prod_pvt++;
|
||||
merge_bio.head = copy[i].request->bio;
|
||||
merge_bio.tail = copy[i].request->biotail;
|
||||
bio_list_merge(&bio_list, &merge_bio);
|
||||
copy[i].request->bio = NULL;
|
||||
blk_put_request(copy[i].request);
|
||||
}
|
||||
|
||||
kfree(copy);
|
||||
|
||||
/*
|
||||
* Empty the queue, this is important because we might have
|
||||
* requests in the queue with more segments than what we
|
||||
* can handle now.
|
||||
*/
|
||||
spin_lock_irq(&info->io_lock);
|
||||
while ((req = blk_fetch_request(info->rq)) != NULL) {
|
||||
if (req->cmd_flags &
|
||||
(REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
|
||||
list_add(&req->queuelist, &requests);
|
||||
continue;
|
||||
}
|
||||
merge_bio.head = req->bio;
|
||||
merge_bio.tail = req->biotail;
|
||||
bio_list_merge(&bio_list, &merge_bio);
|
||||
req->bio = NULL;
|
||||
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
|
||||
pr_alert("diskcache flush request found!\n");
|
||||
__blk_put_request(info->rq, req);
|
||||
}
|
||||
spin_unlock_irq(&info->io_lock);
|
||||
|
||||
xenbus_switch_state(info->xbdev, XenbusStateConnected);
|
||||
|
||||
spin_lock_irq(&info->io_lock);
|
||||
@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info)
|
||||
/* Now safe for us to use the shared ring */
|
||||
info->connected = BLKIF_STATE_CONNECTED;
|
||||
|
||||
/* Send off requeued requests */
|
||||
flush_requests(info);
|
||||
|
||||
/* Kick any other new requests queued since we resumed */
|
||||
kick_pending_request_queues(info);
|
||||
|
||||
list_for_each_entry_safe(req, n, &requests, queuelist) {
|
||||
/* Requeue pending requests (flush or discard) */
|
||||
list_del_init(&req->queuelist);
|
||||
BUG_ON(req->nr_phys_segments > segs);
|
||||
blk_requeue_request(info->rq, req);
|
||||
}
|
||||
spin_unlock_irq(&info->io_lock);
|
||||
|
||||
while ((bio = bio_list_pop(&bio_list)) != NULL) {
|
||||
/* Traverse the list of pending bios and re-queue them */
|
||||
if (bio_segments(bio) > segs) {
|
||||
/*
|
||||
* This bio has more segments than what we can
|
||||
* handle, we have to split it.
|
||||
*/
|
||||
pending = (bio_segments(bio) + segs - 1) / segs;
|
||||
split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
|
||||
BUG_ON(split_bio == NULL);
|
||||
atomic_set(&split_bio->pending, pending);
|
||||
split_bio->bio = bio;
|
||||
for (i = 0; i < pending; i++) {
|
||||
offset = (i * segs * PAGE_SIZE) >> 9;
|
||||
size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
|
||||
(unsigned int)(bio->bi_size >> 9) - offset);
|
||||
cloned_bio = bio_clone(bio, GFP_NOIO);
|
||||
BUG_ON(cloned_bio == NULL);
|
||||
trim_bio(cloned_bio, offset, size);
|
||||
cloned_bio->bi_private = split_bio;
|
||||
cloned_bio->bi_end_io = split_bio_end;
|
||||
submit_bio(cloned_bio->bi_rw, cloned_bio);
|
||||
}
|
||||
/*
|
||||
* Now we have to wait for all those smaller bios to
|
||||
* end, so we can also end the "parent" bio.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
/* We don't need to split this bio */
|
||||
submit_bio(bio->bi_rw, bio);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev)
|
||||
blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
|
||||
|
||||
err = talk_to_blkback(dev, info);
|
||||
if (info->connected == BLKIF_STATE_SUSPENDED && !err)
|
||||
err = blkif_recover(info);
|
||||
|
||||
/*
|
||||
* We have to wait for the backend to switch to
|
||||
* connected state, since we want to read which
|
||||
* features it supports.
|
||||
*/
|
||||
|
||||
return err;
|
||||
}
|
||||
@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info)
|
||||
kfree(type);
|
||||
}
|
||||
|
||||
static int blkfront_setup_indirect(struct blkfront_info *info)
|
||||
{
|
||||
unsigned int indirect_segments, segs;
|
||||
int err, i;
|
||||
|
||||
err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
|
||||
"feature-max-indirect-segments", "%u", &indirect_segments,
|
||||
NULL);
|
||||
if (err) {
|
||||
info->max_indirect_segments = 0;
|
||||
segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
|
||||
} else {
|
||||
info->max_indirect_segments = min(indirect_segments,
|
||||
xen_blkif_max_segments);
|
||||
segs = info->max_indirect_segments;
|
||||
}
|
||||
|
||||
err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
|
||||
if (err)
|
||||
goto out_of_memory;
|
||||
|
||||
for (i = 0; i < BLK_RING_SIZE; i++) {
|
||||
info->shadow[i].grants_used = kzalloc(
|
||||
sizeof(info->shadow[i].grants_used[0]) * segs,
|
||||
GFP_NOIO);
|
||||
info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
|
||||
if (info->max_indirect_segments)
|
||||
info->shadow[i].indirect_grants = kzalloc(
|
||||
sizeof(info->shadow[i].indirect_grants[0]) *
|
||||
INDIRECT_GREFS(segs),
|
||||
GFP_NOIO);
|
||||
if ((info->shadow[i].grants_used == NULL) ||
|
||||
(info->shadow[i].sg == NULL) ||
|
||||
(info->max_indirect_segments &&
|
||||
(info->shadow[i].indirect_grants == NULL)))
|
||||
goto out_of_memory;
|
||||
sg_init_table(info->shadow[i].sg, segs);
|
||||
}
|
||||
|
||||
|
||||
return 0;
|
||||
|
||||
out_of_memory:
|
||||
for (i = 0; i < BLK_RING_SIZE; i++) {
|
||||
kfree(info->shadow[i].grants_used);
|
||||
info->shadow[i].grants_used = NULL;
|
||||
kfree(info->shadow[i].sg);
|
||||
info->shadow[i].sg = NULL;
|
||||
kfree(info->shadow[i].indirect_grants);
|
||||
info->shadow[i].indirect_grants = NULL;
|
||||
}
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
|
||||
* Invoked when the backend is finally 'ready' (and has produced
|
||||
* the details about the physical device - #sectors, size, etc).
|
||||
@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info)
|
||||
{
|
||||
unsigned long long sectors;
|
||||
unsigned long sector_size;
|
||||
unsigned int physical_sector_size;
|
||||
unsigned int binfo;
|
||||
int err;
|
||||
int barrier, flush, discard, persistent;
|
||||
@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info)
|
||||
set_capacity(info->gd, sectors);
|
||||
revalidate_disk(info->gd);
|
||||
|
||||
/* fall through */
|
||||
return;
|
||||
case BLKIF_STATE_SUSPENDED:
|
||||
/*
|
||||
* If we are recovering from suspension, we need to wait
|
||||
* for the backend to announce its features before
|
||||
* reconnecting, at least we need to know if the backend
|
||||
* supports indirect descriptors, and how many.
|
||||
*/
|
||||
blkif_recover(info);
|
||||
return;
|
||||
|
||||
default:
|
||||
@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* physical-sector-size is a newer field, so old backends may not
|
||||
* provide this. Assume physical sector size to be the same as
|
||||
* sector_size in that case.
|
||||
*/
|
||||
err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
|
||||
"physical-sector-size", "%u", &physical_sector_size);
|
||||
if (err != 1)
|
||||
physical_sector_size = sector_size;
|
||||
|
||||
info->feature_flush = 0;
|
||||
info->flush_op = 0;
|
||||
|
||||
@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info)
|
||||
else
|
||||
info->feature_persistent = persistent;
|
||||
|
||||
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
|
||||
err = blkfront_setup_indirect(info);
|
||||
if (err) {
|
||||
xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
|
||||
info->xbdev->otherend);
|
||||
return;
|
||||
}
|
||||
|
||||
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
|
||||
physical_sector_size);
|
||||
if (err) {
|
||||
xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
|
||||
info->xbdev->otherend);
|
||||
|
@ -63,7 +63,10 @@
|
||||
#include "bcache.h"
|
||||
#include "btree.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/random.h>
|
||||
#include <trace/events/bcache.h>
|
||||
|
||||
#define MAX_IN_FLIGHT_DISCARDS 8U
|
||||
|
||||
@ -151,7 +154,7 @@ static void discard_finish(struct work_struct *w)
|
||||
mutex_unlock(&ca->set->bucket_lock);
|
||||
|
||||
closure_wake_up(&ca->set->bucket_wait);
|
||||
wake_up(&ca->set->alloc_wait);
|
||||
wake_up_process(ca->alloc_thread);
|
||||
|
||||
closure_put(&ca->set->cl);
|
||||
}
|
||||
@ -350,38 +353,30 @@ static void invalidate_buckets(struct cache *ca)
|
||||
break;
|
||||
}
|
||||
|
||||
pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
|
||||
fifo_used(&ca->free), ca->free.size,
|
||||
fifo_used(&ca->free_inc), ca->free_inc.size,
|
||||
fifo_used(&ca->unused), ca->unused.size);
|
||||
trace_bcache_alloc_invalidate(ca);
|
||||
}
|
||||
|
||||
#define allocator_wait(ca, cond) \
|
||||
do { \
|
||||
DEFINE_WAIT(__wait); \
|
||||
\
|
||||
while (1) { \
|
||||
prepare_to_wait(&ca->set->alloc_wait, \
|
||||
&__wait, TASK_INTERRUPTIBLE); \
|
||||
set_current_state(TASK_INTERRUPTIBLE); \
|
||||
if (cond) \
|
||||
break; \
|
||||
\
|
||||
mutex_unlock(&(ca)->set->bucket_lock); \
|
||||
if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
|
||||
finish_wait(&ca->set->alloc_wait, &__wait); \
|
||||
closure_return(cl); \
|
||||
} \
|
||||
if (kthread_should_stop()) \
|
||||
return 0; \
|
||||
\
|
||||
try_to_freeze(); \
|
||||
schedule(); \
|
||||
mutex_lock(&(ca)->set->bucket_lock); \
|
||||
} \
|
||||
\
|
||||
finish_wait(&ca->set->alloc_wait, &__wait); \
|
||||
__set_current_state(TASK_RUNNING); \
|
||||
} while (0)
|
||||
|
||||
void bch_allocator_thread(struct closure *cl)
|
||||
static int bch_allocator_thread(void *arg)
|
||||
{
|
||||
struct cache *ca = container_of(cl, struct cache, alloc);
|
||||
struct cache *ca = arg;
|
||||
|
||||
mutex_lock(&ca->set->bucket_lock);
|
||||
|
||||
@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
|
||||
{
|
||||
long r = -1;
|
||||
again:
|
||||
wake_up(&ca->set->alloc_wait);
|
||||
wake_up_process(ca->alloc_thread);
|
||||
|
||||
if (fifo_used(&ca->free) > ca->watermark[watermark] &&
|
||||
fifo_pop(&ca->free, r)) {
|
||||
@ -476,9 +471,7 @@ again:
|
||||
return r;
|
||||
}
|
||||
|
||||
pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
|
||||
atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
|
||||
fifo_used(&ca->free_inc), fifo_used(&ca->unused));
|
||||
trace_bcache_alloc_fail(ca);
|
||||
|
||||
if (cl) {
|
||||
closure_wait(&ca->set->bucket_wait, cl);
|
||||
@ -552,6 +545,17 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
|
||||
|
||||
/* Init */
|
||||
|
||||
int bch_cache_allocator_start(struct cache *ca)
|
||||
{
|
||||
struct task_struct *k = kthread_run(bch_allocator_thread,
|
||||
ca, "bcache_allocator");
|
||||
if (IS_ERR(k))
|
||||
return PTR_ERR(k);
|
||||
|
||||
ca->alloc_thread = k;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch_cache_allocator_exit(struct cache *ca)
|
||||
{
|
||||
struct discard *d;
|
||||
|
@ -178,7 +178,6 @@
|
||||
#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
|
||||
|
||||
#include <linux/bio.h>
|
||||
#include <linux/blktrace_api.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mutex.h>
|
||||
@ -388,8 +387,6 @@ struct keybuf_key {
|
||||
typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
|
||||
|
||||
struct keybuf {
|
||||
keybuf_pred_fn *key_predicate;
|
||||
|
||||
struct bkey last_scanned;
|
||||
spinlock_t lock;
|
||||
|
||||
@ -437,9 +434,12 @@ struct bcache_device {
|
||||
|
||||
/* If nonzero, we're detaching/unregistering from cache set */
|
||||
atomic_t detaching;
|
||||
int flush_done;
|
||||
|
||||
uint64_t nr_stripes;
|
||||
unsigned stripe_size_bits;
|
||||
atomic_t *stripe_sectors_dirty;
|
||||
|
||||
atomic_long_t sectors_dirty;
|
||||
unsigned long sectors_dirty_gc;
|
||||
unsigned long sectors_dirty_last;
|
||||
long sectors_dirty_derivative;
|
||||
|
||||
@ -531,6 +531,7 @@ struct cached_dev {
|
||||
unsigned sequential_merge:1;
|
||||
unsigned verify:1;
|
||||
|
||||
unsigned partial_stripes_expensive:1;
|
||||
unsigned writeback_metadata:1;
|
||||
unsigned writeback_running:1;
|
||||
unsigned char writeback_percent;
|
||||
@ -565,8 +566,7 @@ struct cache {
|
||||
|
||||
unsigned watermark[WATERMARK_MAX];
|
||||
|
||||
struct closure alloc;
|
||||
struct workqueue_struct *alloc_workqueue;
|
||||
struct task_struct *alloc_thread;
|
||||
|
||||
struct closure prio;
|
||||
struct prio_set *disk_buckets;
|
||||
@ -664,13 +664,9 @@ struct gc_stat {
|
||||
* CACHE_SET_STOPPING always gets set first when we're closing down a cache set;
|
||||
* we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e.
|
||||
* flushing dirty data).
|
||||
*
|
||||
* CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down
|
||||
* the allocation thread.
|
||||
*/
|
||||
#define CACHE_SET_UNREGISTERING 0
|
||||
#define CACHE_SET_STOPPING 1
|
||||
#define CACHE_SET_STOPPING_2 2
|
||||
|
||||
struct cache_set {
|
||||
struct closure cl;
|
||||
@ -703,9 +699,6 @@ struct cache_set {
|
||||
/* For the btree cache */
|
||||
struct shrinker shrink;
|
||||
|
||||
/* For the allocator itself */
|
||||
wait_queue_head_t alloc_wait;
|
||||
|
||||
/* For the btree cache and anything allocation related */
|
||||
struct mutex bucket_lock;
|
||||
|
||||
@ -823,10 +816,9 @@ struct cache_set {
|
||||
|
||||
/*
|
||||
* A btree node on disk could have too many bsets for an iterator to fit
|
||||
* on the stack - this is a single element mempool for btree_read_work()
|
||||
* on the stack - have to dynamically allocate them
|
||||
*/
|
||||
struct mutex fill_lock;
|
||||
struct btree_iter *fill_iter;
|
||||
mempool_t *fill_iter;
|
||||
|
||||
/*
|
||||
* btree_sort() is a merge sort and requires temporary space - single
|
||||
@ -834,6 +826,7 @@ struct cache_set {
|
||||
*/
|
||||
struct mutex sort_lock;
|
||||
struct bset *sort;
|
||||
unsigned sort_crit_factor;
|
||||
|
||||
/* List of buckets we're currently writing data to */
|
||||
struct list_head data_buckets;
|
||||
@ -906,8 +899,6 @@ static inline unsigned local_clock_us(void)
|
||||
return local_clock() >> 10;
|
||||
}
|
||||
|
||||
#define MAX_BSETS 4U
|
||||
|
||||
#define BTREE_PRIO USHRT_MAX
|
||||
#define INITIAL_PRIO 32768
|
||||
|
||||
@ -1112,23 +1103,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
|
||||
atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
|
||||
}
|
||||
|
||||
/* Blktrace macros */
|
||||
|
||||
#define blktrace_msg(c, fmt, ...) \
|
||||
do { \
|
||||
struct request_queue *q = bdev_get_queue(c->bdev); \
|
||||
if (q) \
|
||||
blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define blktrace_msg_all(s, fmt, ...) \
|
||||
do { \
|
||||
struct cache *_c; \
|
||||
unsigned i; \
|
||||
for_each_cache(_c, (s), i) \
|
||||
blktrace_msg(_c, fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
static inline void cached_dev_put(struct cached_dev *dc)
|
||||
{
|
||||
if (atomic_dec_and_test(&dc->count))
|
||||
@ -1173,10 +1147,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
|
||||
static struct kobj_attribute ksysfs_##n = \
|
||||
__ATTR(n, S_IWUSR|S_IRUSR, show, store)
|
||||
|
||||
/* Forward declarations */
|
||||
static inline void wake_up_allocators(struct cache_set *c)
|
||||
{
|
||||
struct cache *ca;
|
||||
unsigned i;
|
||||
|
||||
void bch_writeback_queue(struct cached_dev *);
|
||||
void bch_writeback_add(struct cached_dev *, unsigned);
|
||||
for_each_cache(ca, c, i)
|
||||
wake_up_process(ca->alloc_thread);
|
||||
}
|
||||
|
||||
/* Forward declarations */
|
||||
|
||||
void bch_count_io_errors(struct cache *, int, const char *);
|
||||
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
|
||||
@ -1193,7 +1173,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
|
||||
uint8_t bch_inc_gen(struct cache *, struct bucket *);
|
||||
void bch_rescale_priorities(struct cache_set *, int);
|
||||
bool bch_bucket_add_unused(struct cache *, struct bucket *);
|
||||
void bch_allocator_thread(struct closure *);
|
||||
|
||||
long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
|
||||
void bch_bucket_free(struct cache_set *, struct bkey *);
|
||||
@ -1241,9 +1220,9 @@ void bch_cache_set_stop(struct cache_set *);
|
||||
struct cache_set *bch_cache_set_alloc(struct cache_sb *);
|
||||
void bch_btree_cache_free(struct cache_set *);
|
||||
int bch_btree_cache_alloc(struct cache_set *);
|
||||
void bch_cached_dev_writeback_init(struct cached_dev *);
|
||||
void bch_moving_init_cache_set(struct cache_set *);
|
||||
|
||||
int bch_cache_allocator_start(struct cache *ca);
|
||||
void bch_cache_allocator_exit(struct cache *ca);
|
||||
int bch_cache_allocator_init(struct cache *ca);
|
||||
|
||||
|
@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
|
||||
bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
|
||||
{
|
||||
unsigned i;
|
||||
char buf[80];
|
||||
|
||||
if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
|
||||
goto bad;
|
||||
@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
|
||||
|
||||
return false;
|
||||
bad:
|
||||
cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
|
||||
bch_bkey_to_text(buf, sizeof(buf), k);
|
||||
cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
|
||||
#ifdef CONFIG_BCACHE_EDEBUG
|
||||
bug:
|
||||
mutex_unlock(&b->c->bucket_lock);
|
||||
btree_bug(b,
|
||||
|
||||
{
|
||||
char buf[80];
|
||||
|
||||
bch_bkey_to_text(buf, sizeof(buf), k);
|
||||
btree_bug(b,
|
||||
"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
|
||||
pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
|
||||
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
|
||||
buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
|
||||
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
|
||||
new->sets->size = 0;
|
||||
}
|
||||
|
||||
#define SORT_CRIT (4096 / sizeof(uint64_t))
|
||||
|
||||
void bch_btree_sort_lazy(struct btree *b)
|
||||
{
|
||||
if (b->nsets) {
|
||||
unsigned i, j, keys = 0, total;
|
||||
unsigned crit = SORT_CRIT;
|
||||
int i;
|
||||
|
||||
for (i = 0; i <= b->nsets; i++)
|
||||
keys += b->sets[i].data->keys;
|
||||
/* Don't sort if nothing to do */
|
||||
if (!b->nsets)
|
||||
goto out;
|
||||
|
||||
total = keys;
|
||||
/* If not a leaf node, always sort */
|
||||
if (b->level) {
|
||||
bch_btree_sort(b);
|
||||
return;
|
||||
}
|
||||
|
||||
for (j = 0; j < b->nsets; j++) {
|
||||
if (keys * 2 < total ||
|
||||
keys < 1000) {
|
||||
bch_btree_sort_partial(b, j);
|
||||
return;
|
||||
}
|
||||
for (i = b->nsets - 1; i >= 0; --i) {
|
||||
crit *= b->c->sort_crit_factor;
|
||||
|
||||
keys -= b->sets[j].data->keys;
|
||||
}
|
||||
|
||||
/* Must sort if b->nsets == 3 or we'll overflow */
|
||||
if (b->nsets >= (MAX_BSETS - 1) - b->level) {
|
||||
bch_btree_sort(b);
|
||||
if (b->sets[i].data->keys < crit) {
|
||||
bch_btree_sort_partial(b, i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Sort if we'd overflow */
|
||||
if (b->nsets + 1 == MAX_BSETS) {
|
||||
bch_btree_sort(b);
|
||||
return;
|
||||
}
|
||||
|
||||
out:
|
||||
bset_build_written_tree(b);
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,8 @@
|
||||
#ifndef _BCACHE_BSET_H
|
||||
#define _BCACHE_BSET_H
|
||||
|
||||
#include <linux/slab.h>
|
||||
|
||||
/*
|
||||
* BKEYS:
|
||||
*
|
||||
@ -142,6 +144,8 @@
|
||||
|
||||
/* Btree key comparison/iteration */
|
||||
|
||||
#define MAX_BSETS 4U
|
||||
|
||||
struct btree_iter {
|
||||
size_t size, used;
|
||||
struct btree_iter_set {
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "btree.h"
|
||||
#include "debug.h"
|
||||
#include "request.h"
|
||||
#include "writeback.h"
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/bitops.h>
|
||||
@ -134,44 +135,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i)
|
||||
return crc ^ 0xffffffffffffffffULL;
|
||||
}
|
||||
|
||||
static void btree_bio_endio(struct bio *bio, int error)
|
||||
static void bch_btree_node_read_done(struct btree *b)
|
||||
{
|
||||
struct closure *cl = bio->bi_private;
|
||||
struct btree *b = container_of(cl, struct btree, io.cl);
|
||||
|
||||
if (error)
|
||||
set_btree_node_io_error(b);
|
||||
|
||||
bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE)
|
||||
? "writing btree" : "reading btree");
|
||||
closure_put(cl);
|
||||
}
|
||||
|
||||
static void btree_bio_init(struct btree *b)
|
||||
{
|
||||
BUG_ON(b->bio);
|
||||
b->bio = bch_bbio_alloc(b->c);
|
||||
|
||||
b->bio->bi_end_io = btree_bio_endio;
|
||||
b->bio->bi_private = &b->io.cl;
|
||||
}
|
||||
|
||||
void bch_btree_read_done(struct closure *cl)
|
||||
{
|
||||
struct btree *b = container_of(cl, struct btree, io.cl);
|
||||
struct bset *i = b->sets[0].data;
|
||||
struct btree_iter *iter = b->c->fill_iter;
|
||||
const char *err = "bad btree header";
|
||||
BUG_ON(b->nsets || b->written);
|
||||
struct bset *i = b->sets[0].data;
|
||||
struct btree_iter *iter;
|
||||
|
||||
bch_bbio_free(b->bio, b->c);
|
||||
b->bio = NULL;
|
||||
|
||||
mutex_lock(&b->c->fill_lock);
|
||||
iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
|
||||
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
|
||||
iter->used = 0;
|
||||
|
||||
if (btree_node_io_error(b) ||
|
||||
!i->seq)
|
||||
if (!i->seq)
|
||||
goto err;
|
||||
|
||||
for (;
|
||||
@ -228,17 +202,8 @@ void bch_btree_read_done(struct closure *cl)
|
||||
if (b->written < btree_blocks(b))
|
||||
bch_bset_init_next(b);
|
||||
out:
|
||||
|
||||
mutex_unlock(&b->c->fill_lock);
|
||||
|
||||
spin_lock(&b->c->btree_read_time_lock);
|
||||
bch_time_stats_update(&b->c->btree_read_time, b->io_start_time);
|
||||
spin_unlock(&b->c->btree_read_time_lock);
|
||||
|
||||
smp_wmb(); /* read_done is our write lock */
|
||||
set_btree_node_read_done(b);
|
||||
|
||||
closure_return(cl);
|
||||
mempool_free(iter, b->c->fill_iter);
|
||||
return;
|
||||
err:
|
||||
set_btree_node_io_error(b);
|
||||
bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
|
||||
@ -247,48 +212,69 @@ err:
|
||||
goto out;
|
||||
}
|
||||
|
||||
void bch_btree_read(struct btree *b)
|
||||
static void btree_node_read_endio(struct bio *bio, int error)
|
||||
{
|
||||
BUG_ON(b->nsets || b->written);
|
||||
struct closure *cl = bio->bi_private;
|
||||
closure_put(cl);
|
||||
}
|
||||
|
||||
if (!closure_trylock(&b->io.cl, &b->c->cl))
|
||||
BUG();
|
||||
void bch_btree_node_read(struct btree *b)
|
||||
{
|
||||
uint64_t start_time = local_clock();
|
||||
struct closure cl;
|
||||
struct bio *bio;
|
||||
|
||||
b->io_start_time = local_clock();
|
||||
trace_bcache_btree_read(b);
|
||||
|
||||
btree_bio_init(b);
|
||||
b->bio->bi_rw = REQ_META|READ_SYNC;
|
||||
b->bio->bi_size = KEY_SIZE(&b->key) << 9;
|
||||
closure_init_stack(&cl);
|
||||
|
||||
bch_bio_map(b->bio, b->sets[0].data);
|
||||
bio = bch_bbio_alloc(b->c);
|
||||
bio->bi_rw = REQ_META|READ_SYNC;
|
||||
bio->bi_size = KEY_SIZE(&b->key) << 9;
|
||||
bio->bi_end_io = btree_node_read_endio;
|
||||
bio->bi_private = &cl;
|
||||
|
||||
pr_debug("%s", pbtree(b));
|
||||
trace_bcache_btree_read(b->bio);
|
||||
bch_submit_bbio(b->bio, b->c, &b->key, 0);
|
||||
bch_bio_map(bio, b->sets[0].data);
|
||||
|
||||
continue_at(&b->io.cl, bch_btree_read_done, system_wq);
|
||||
bch_submit_bbio(bio, b->c, &b->key, 0);
|
||||
closure_sync(&cl);
|
||||
|
||||
if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
|
||||
set_btree_node_io_error(b);
|
||||
|
||||
bch_bbio_free(bio, b->c);
|
||||
|
||||
if (btree_node_io_error(b))
|
||||
goto err;
|
||||
|
||||
bch_btree_node_read_done(b);
|
||||
|
||||
spin_lock(&b->c->btree_read_time_lock);
|
||||
bch_time_stats_update(&b->c->btree_read_time, start_time);
|
||||
spin_unlock(&b->c->btree_read_time_lock);
|
||||
|
||||
return;
|
||||
err:
|
||||
bch_cache_set_error(b->c, "io error reading bucket %lu",
|
||||
PTR_BUCKET_NR(b->c, &b->key, 0));
|
||||
}
|
||||
|
||||
static void btree_complete_write(struct btree *b, struct btree_write *w)
|
||||
{
|
||||
if (w->prio_blocked &&
|
||||
!atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
|
||||
wake_up(&b->c->alloc_wait);
|
||||
wake_up_allocators(b->c);
|
||||
|
||||
if (w->journal) {
|
||||
atomic_dec_bug(w->journal);
|
||||
__closure_wake_up(&b->c->journal.wait);
|
||||
}
|
||||
|
||||
if (w->owner)
|
||||
closure_put(w->owner);
|
||||
|
||||
w->prio_blocked = 0;
|
||||
w->journal = NULL;
|
||||
w->owner = NULL;
|
||||
}
|
||||
|
||||
static void __btree_write_done(struct closure *cl)
|
||||
static void __btree_node_write_done(struct closure *cl)
|
||||
{
|
||||
struct btree *b = container_of(cl, struct btree, io.cl);
|
||||
struct btree_write *w = btree_prev_write(b);
|
||||
@ -304,7 +290,7 @@ static void __btree_write_done(struct closure *cl)
|
||||
closure_return(cl);
|
||||
}
|
||||
|
||||
static void btree_write_done(struct closure *cl)
|
||||
static void btree_node_write_done(struct closure *cl)
|
||||
{
|
||||
struct btree *b = container_of(cl, struct btree, io.cl);
|
||||
struct bio_vec *bv;
|
||||
@ -313,10 +299,22 @@ static void btree_write_done(struct closure *cl)
|
||||
__bio_for_each_segment(bv, b->bio, n, 0)
|
||||
__free_page(bv->bv_page);
|
||||
|
||||
__btree_write_done(cl);
|
||||
__btree_node_write_done(cl);
|
||||
}
|
||||
|
||||
static void do_btree_write(struct btree *b)
|
||||
static void btree_node_write_endio(struct bio *bio, int error)
|
||||
{
|
||||
struct closure *cl = bio->bi_private;
|
||||
struct btree *b = container_of(cl, struct btree, io.cl);
|
||||
|
||||
if (error)
|
||||
set_btree_node_io_error(b);
|
||||
|
||||
bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
|
||||
closure_put(cl);
|
||||
}
|
||||
|
||||
static void do_btree_node_write(struct btree *b)
|
||||
{
|
||||
struct closure *cl = &b->io.cl;
|
||||
struct bset *i = b->sets[b->nsets].data;
|
||||
@ -325,15 +323,34 @@ static void do_btree_write(struct btree *b)
|
||||
i->version = BCACHE_BSET_VERSION;
|
||||
i->csum = btree_csum_set(b, i);
|
||||
|
||||
btree_bio_init(b);
|
||||
b->bio->bi_rw = REQ_META|WRITE_SYNC;
|
||||
b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
|
||||
BUG_ON(b->bio);
|
||||
b->bio = bch_bbio_alloc(b->c);
|
||||
|
||||
b->bio->bi_end_io = btree_node_write_endio;
|
||||
b->bio->bi_private = &b->io.cl;
|
||||
b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
|
||||
b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
|
||||
bch_bio_map(b->bio, i);
|
||||
|
||||
/*
|
||||
* If we're appending to a leaf node, we don't technically need FUA -
|
||||
* this write just needs to be persisted before the next journal write,
|
||||
* which will be marked FLUSH|FUA.
|
||||
*
|
||||
* Similarly if we're writing a new btree root - the pointer is going to
|
||||
* be in the next journal entry.
|
||||
*
|
||||
* But if we're writing a new btree node (that isn't a root) or
|
||||
* appending to a non leaf btree node, we need either FUA or a flush
|
||||
* when we write the parent with the new pointer. FUA is cheaper than a
|
||||
* flush, and writes appending to leaf nodes aren't blocking anything so
|
||||
* just make all btree node writes FUA to keep things sane.
|
||||
*/
|
||||
|
||||
bkey_copy(&k.key, &b->key);
|
||||
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
|
||||
|
||||
if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) {
|
||||
if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
|
||||
int j;
|
||||
struct bio_vec *bv;
|
||||
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
|
||||
@ -342,40 +359,41 @@ static void do_btree_write(struct btree *b)
|
||||
memcpy(page_address(bv->bv_page),
|
||||
base + j * PAGE_SIZE, PAGE_SIZE);
|
||||
|
||||
trace_bcache_btree_write(b->bio);
|
||||
bch_submit_bbio(b->bio, b->c, &k.key, 0);
|
||||
|
||||
continue_at(cl, btree_write_done, NULL);
|
||||
continue_at(cl, btree_node_write_done, NULL);
|
||||
} else {
|
||||
b->bio->bi_vcnt = 0;
|
||||
bch_bio_map(b->bio, i);
|
||||
|
||||
trace_bcache_btree_write(b->bio);
|
||||
bch_submit_bbio(b->bio, b->c, &k.key, 0);
|
||||
|
||||
closure_sync(cl);
|
||||
__btree_write_done(cl);
|
||||
__btree_node_write_done(cl);
|
||||
}
|
||||
}
|
||||
|
||||
static void __btree_write(struct btree *b)
|
||||
void bch_btree_node_write(struct btree *b, struct closure *parent)
|
||||
{
|
||||
struct bset *i = b->sets[b->nsets].data;
|
||||
|
||||
BUG_ON(current->bio_list);
|
||||
trace_bcache_btree_write(b);
|
||||
|
||||
BUG_ON(current->bio_list);
|
||||
BUG_ON(b->written >= btree_blocks(b));
|
||||
BUG_ON(b->written && !i->keys);
|
||||
BUG_ON(b->sets->data->seq != i->seq);
|
||||
bch_check_key_order(b, i);
|
||||
|
||||
closure_lock(&b->io, &b->c->cl);
|
||||
cancel_delayed_work(&b->work);
|
||||
|
||||
/* If caller isn't waiting for write, parent refcount is cache set */
|
||||
closure_lock(&b->io, parent ?: &b->c->cl);
|
||||
|
||||
clear_bit(BTREE_NODE_dirty, &b->flags);
|
||||
change_bit(BTREE_NODE_write_idx, &b->flags);
|
||||
|
||||
bch_check_key_order(b, i);
|
||||
BUG_ON(b->written && !i->keys);
|
||||
|
||||
do_btree_write(b);
|
||||
|
||||
pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys);
|
||||
do_btree_node_write(b);
|
||||
|
||||
b->written += set_blocks(i, b->c);
|
||||
atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
|
||||
@ -387,37 +405,31 @@ static void __btree_write(struct btree *b)
|
||||
bch_bset_init_next(b);
|
||||
}
|
||||
|
||||
static void btree_write_work(struct work_struct *w)
|
||||
static void btree_node_write_work(struct work_struct *w)
|
||||
{
|
||||
struct btree *b = container_of(to_delayed_work(w), struct btree, work);
|
||||
|
||||
down_write(&b->lock);
|
||||
rw_lock(true, b, b->level);
|
||||
|
||||
if (btree_node_dirty(b))
|
||||
__btree_write(b);
|
||||
up_write(&b->lock);
|
||||
bch_btree_node_write(b, NULL);
|
||||
rw_unlock(true, b);
|
||||
}
|
||||
|
||||
void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
|
||||
static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
|
||||
{
|
||||
struct bset *i = b->sets[b->nsets].data;
|
||||
struct btree_write *w = btree_current_write(b);
|
||||
|
||||
BUG_ON(b->written &&
|
||||
(b->written >= btree_blocks(b) ||
|
||||
i->seq != b->sets[0].data->seq ||
|
||||
!i->keys));
|
||||
BUG_ON(!b->written);
|
||||
BUG_ON(!i->keys);
|
||||
|
||||
if (!btree_node_dirty(b)) {
|
||||
set_btree_node_dirty(b);
|
||||
queue_delayed_work(btree_io_wq, &b->work,
|
||||
msecs_to_jiffies(30000));
|
||||
}
|
||||
if (!btree_node_dirty(b))
|
||||
queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
|
||||
|
||||
w->prio_blocked += b->prio_blocked;
|
||||
b->prio_blocked = 0;
|
||||
set_btree_node_dirty(b);
|
||||
|
||||
if (op && op->journal && !b->level) {
|
||||
if (op && op->journal) {
|
||||
if (w->journal &&
|
||||
journal_pin_cmp(b->c, w, op)) {
|
||||
atomic_dec_bug(w->journal);
|
||||
@ -430,23 +442,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op)
|
||||
}
|
||||
}
|
||||
|
||||
if (current->bio_list)
|
||||
return;
|
||||
|
||||
/* Force write if set is too big */
|
||||
if (now ||
|
||||
b->level ||
|
||||
set_bytes(i) > PAGE_SIZE - 48) {
|
||||
if (op && now) {
|
||||
/* Must wait on multiple writes */
|
||||
BUG_ON(w->owner);
|
||||
w->owner = &op->cl;
|
||||
closure_get(&op->cl);
|
||||
}
|
||||
|
||||
__btree_write(b);
|
||||
}
|
||||
BUG_ON(!b->written);
|
||||
if (set_bytes(i) > PAGE_SIZE - 48 &&
|
||||
!current->bio_list)
|
||||
bch_btree_node_write(b, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -559,7 +558,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
|
||||
init_rwsem(&b->lock);
|
||||
lockdep_set_novalidate_class(&b->lock);
|
||||
INIT_LIST_HEAD(&b->list);
|
||||
INIT_DELAYED_WORK(&b->work, btree_write_work);
|
||||
INIT_DELAYED_WORK(&b->work, btree_node_write_work);
|
||||
b->c = c;
|
||||
closure_init_unlocked(&b->io);
|
||||
|
||||
@ -582,7 +581,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
|
||||
BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
|
||||
|
||||
if (cl && btree_node_dirty(b))
|
||||
bch_btree_write(b, true, NULL);
|
||||
bch_btree_node_write(b, NULL);
|
||||
|
||||
if (cl)
|
||||
closure_wait_event_async(&b->io.wait, cl,
|
||||
@ -623,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
|
||||
else if (!mutex_trylock(&c->bucket_lock))
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* It's _really_ critical that we don't free too many btree nodes - we
|
||||
* have to always leave ourselves a reserve. The reserve is how we
|
||||
* guarantee that allocating memory for a new btree node can always
|
||||
* succeed, so that inserting keys into the btree can always succeed and
|
||||
* IO can always make forward progress:
|
||||
*/
|
||||
nr /= c->btree_pages;
|
||||
nr = min_t(unsigned long, nr, mca_can_free(c));
|
||||
|
||||
@ -766,6 +772,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
|
||||
int ret = -ENOMEM;
|
||||
struct btree *i;
|
||||
|
||||
trace_bcache_btree_cache_cannibalize(c);
|
||||
|
||||
if (!cl)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
@ -784,7 +792,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k,
|
||||
return ERR_PTR(-EAGAIN);
|
||||
}
|
||||
|
||||
/* XXX: tracepoint */
|
||||
c->try_harder = cl;
|
||||
c->try_harder_start = local_clock();
|
||||
retry:
|
||||
@ -905,6 +912,9 @@ retry:
|
||||
b = mca_find(c, k);
|
||||
|
||||
if (!b) {
|
||||
if (current->bio_list)
|
||||
return ERR_PTR(-EAGAIN);
|
||||
|
||||
mutex_lock(&c->bucket_lock);
|
||||
b = mca_alloc(c, k, level, &op->cl);
|
||||
mutex_unlock(&c->bucket_lock);
|
||||
@ -914,7 +924,7 @@ retry:
|
||||
if (IS_ERR(b))
|
||||
return b;
|
||||
|
||||
bch_btree_read(b);
|
||||
bch_btree_node_read(b);
|
||||
|
||||
if (!write)
|
||||
downgrade_write(&b->lock);
|
||||
@ -937,15 +947,12 @@ retry:
|
||||
for (; i <= b->nsets; i++)
|
||||
prefetch(b->sets[i].data);
|
||||
|
||||
if (!closure_wait_event(&b->io.wait, &op->cl,
|
||||
btree_node_read_done(b))) {
|
||||
if (btree_node_io_error(b)) {
|
||||
rw_unlock(write, b);
|
||||
b = ERR_PTR(-EAGAIN);
|
||||
} else if (btree_node_io_error(b)) {
|
||||
rw_unlock(write, b);
|
||||
b = ERR_PTR(-EIO);
|
||||
} else
|
||||
BUG_ON(!b->written);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
BUG_ON(!b->written);
|
||||
|
||||
return b;
|
||||
}
|
||||
@ -959,7 +966,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
|
||||
mutex_unlock(&c->bucket_lock);
|
||||
|
||||
if (!IS_ERR_OR_NULL(b)) {
|
||||
bch_btree_read(b);
|
||||
bch_btree_node_read(b);
|
||||
rw_unlock(true, b);
|
||||
}
|
||||
}
|
||||
@ -970,24 +977,19 @@ static void btree_node_free(struct btree *b, struct btree_op *op)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
trace_bcache_btree_node_free(b);
|
||||
|
||||
/*
|
||||
* The BUG_ON() in btree_node_get() implies that we must have a write
|
||||
* lock on parent to free or even invalidate a node
|
||||
*/
|
||||
BUG_ON(op->lock <= b->level);
|
||||
BUG_ON(b == b->c->root);
|
||||
pr_debug("bucket %s", pbtree(b));
|
||||
|
||||
if (btree_node_dirty(b))
|
||||
btree_complete_write(b, btree_current_write(b));
|
||||
clear_bit(BTREE_NODE_dirty, &b->flags);
|
||||
|
||||
if (b->prio_blocked &&
|
||||
!atomic_sub_return(b->prio_blocked, &b->c->prio_blocked))
|
||||
wake_up(&b->c->alloc_wait);
|
||||
|
||||
b->prio_blocked = 0;
|
||||
|
||||
cancel_delayed_work(&b->work);
|
||||
|
||||
mutex_lock(&b->c->bucket_lock);
|
||||
@ -1028,17 +1030,20 @@ retry:
|
||||
goto retry;
|
||||
}
|
||||
|
||||
set_btree_node_read_done(b);
|
||||
b->accessed = 1;
|
||||
bch_bset_init_next(b);
|
||||
|
||||
mutex_unlock(&c->bucket_lock);
|
||||
|
||||
trace_bcache_btree_node_alloc(b);
|
||||
return b;
|
||||
err_free:
|
||||
bch_bucket_free(c, &k.key);
|
||||
__bkey_put(c, &k.key);
|
||||
err:
|
||||
mutex_unlock(&c->bucket_lock);
|
||||
|
||||
trace_bcache_btree_node_alloc_fail(b);
|
||||
return b;
|
||||
}
|
||||
|
||||
@ -1137,11 +1142,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
|
||||
gc->nkeys++;
|
||||
|
||||
gc->data += KEY_SIZE(k);
|
||||
if (KEY_DIRTY(k)) {
|
||||
if (KEY_DIRTY(k))
|
||||
gc->dirty += KEY_SIZE(k);
|
||||
if (d)
|
||||
d->sectors_dirty_gc += KEY_SIZE(k);
|
||||
}
|
||||
}
|
||||
|
||||
for (t = b->sets; t <= &b->sets[b->nsets]; t++)
|
||||
@ -1166,14 +1168,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
|
||||
|
||||
if (!IS_ERR_OR_NULL(n)) {
|
||||
swap(b, n);
|
||||
__bkey_put(b->c, &b->key);
|
||||
|
||||
memcpy(k->ptr, b->key.ptr,
|
||||
sizeof(uint64_t) * KEY_PTRS(&b->key));
|
||||
|
||||
__bkey_put(b->c, &b->key);
|
||||
atomic_inc(&b->c->prio_blocked);
|
||||
b->prio_blocked++;
|
||||
|
||||
btree_node_free(n, op);
|
||||
up_write(&n->lock);
|
||||
}
|
||||
@ -1278,7 +1277,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op,
|
||||
btree_node_free(r->b, op);
|
||||
up_write(&r->b->lock);
|
||||
|
||||
pr_debug("coalesced %u nodes", nodes);
|
||||
trace_bcache_btree_gc_coalesce(nodes);
|
||||
|
||||
gc->nodes--;
|
||||
nodes--;
|
||||
@ -1293,14 +1292,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
|
||||
void write(struct btree *r)
|
||||
{
|
||||
if (!r->written)
|
||||
bch_btree_write(r, true, op);
|
||||
else if (btree_node_dirty(r)) {
|
||||
BUG_ON(btree_current_write(r)->owner);
|
||||
btree_current_write(r)->owner = writes;
|
||||
closure_get(writes);
|
||||
|
||||
bch_btree_write(r, true, NULL);
|
||||
}
|
||||
bch_btree_node_write(r, &op->cl);
|
||||
else if (btree_node_dirty(r))
|
||||
bch_btree_node_write(r, writes);
|
||||
|
||||
up_write(&r->lock);
|
||||
}
|
||||
@ -1386,9 +1380,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
|
||||
ret = btree_gc_recurse(b, op, writes, gc);
|
||||
|
||||
if (!b->written || btree_node_dirty(b)) {
|
||||
atomic_inc(&b->c->prio_blocked);
|
||||
b->prio_blocked++;
|
||||
bch_btree_write(b, true, n ? op : NULL);
|
||||
bch_btree_node_write(b, n ? &op->cl : NULL);
|
||||
}
|
||||
|
||||
if (!IS_ERR_OR_NULL(n)) {
|
||||
@ -1405,7 +1397,6 @@ static void btree_gc_start(struct cache_set *c)
|
||||
{
|
||||
struct cache *ca;
|
||||
struct bucket *b;
|
||||
struct bcache_device **d;
|
||||
unsigned i;
|
||||
|
||||
if (!c->gc_mark_valid)
|
||||
@ -1419,16 +1410,12 @@ static void btree_gc_start(struct cache_set *c)
|
||||
for_each_cache(ca, c, i)
|
||||
for_each_bucket(b, ca) {
|
||||
b->gc_gen = b->gen;
|
||||
if (!atomic_read(&b->pin))
|
||||
if (!atomic_read(&b->pin)) {
|
||||
SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
|
||||
SET_GC_SECTORS_USED(b, 0);
|
||||
}
|
||||
}
|
||||
|
||||
for (d = c->devices;
|
||||
d < c->devices + c->nr_uuids;
|
||||
d++)
|
||||
if (*d)
|
||||
(*d)->sectors_dirty_gc = 0;
|
||||
|
||||
mutex_unlock(&c->bucket_lock);
|
||||
}
|
||||
|
||||
@ -1437,7 +1424,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
|
||||
size_t available = 0;
|
||||
struct bucket *b;
|
||||
struct cache *ca;
|
||||
struct bcache_device **d;
|
||||
unsigned i;
|
||||
|
||||
mutex_lock(&c->bucket_lock);
|
||||
@ -1480,22 +1466,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
|
||||
}
|
||||
}
|
||||
|
||||
for (d = c->devices;
|
||||
d < c->devices + c->nr_uuids;
|
||||
d++)
|
||||
if (*d) {
|
||||
unsigned long last =
|
||||
atomic_long_read(&((*d)->sectors_dirty));
|
||||
long difference = (*d)->sectors_dirty_gc - last;
|
||||
|
||||
pr_debug("sectors dirty off by %li", difference);
|
||||
|
||||
(*d)->sectors_dirty_last += difference;
|
||||
|
||||
atomic_long_set(&((*d)->sectors_dirty),
|
||||
(*d)->sectors_dirty_gc);
|
||||
}
|
||||
|
||||
mutex_unlock(&c->bucket_lock);
|
||||
return available;
|
||||
}
|
||||
@ -1508,10 +1478,9 @@ static void bch_btree_gc(struct closure *cl)
|
||||
struct gc_stat stats;
|
||||
struct closure writes;
|
||||
struct btree_op op;
|
||||
|
||||
uint64_t start_time = local_clock();
|
||||
trace_bcache_gc_start(c->sb.set_uuid);
|
||||
blktrace_msg_all(c, "Starting gc");
|
||||
|
||||
trace_bcache_gc_start(c);
|
||||
|
||||
memset(&stats, 0, sizeof(struct gc_stat));
|
||||
closure_init_stack(&writes);
|
||||
@ -1520,14 +1489,14 @@ static void bch_btree_gc(struct closure *cl)
|
||||
|
||||
btree_gc_start(c);
|
||||
|
||||
atomic_inc(&c->prio_blocked);
|
||||
|
||||
ret = btree_root(gc_root, c, &op, &writes, &stats);
|
||||
closure_sync(&op.cl);
|
||||
closure_sync(&writes);
|
||||
|
||||
if (ret) {
|
||||
blktrace_msg_all(c, "Stopped gc");
|
||||
pr_warn("gc failed!");
|
||||
|
||||
continue_at(cl, bch_btree_gc, bch_gc_wq);
|
||||
}
|
||||
|
||||
@ -1537,6 +1506,9 @@ static void bch_btree_gc(struct closure *cl)
|
||||
|
||||
available = bch_btree_gc_finish(c);
|
||||
|
||||
atomic_dec(&c->prio_blocked);
|
||||
wake_up_allocators(c);
|
||||
|
||||
bch_time_stats_update(&c->btree_gc_time, start_time);
|
||||
|
||||
stats.key_bytes *= sizeof(uint64_t);
|
||||
@ -1544,10 +1516,8 @@ static void bch_btree_gc(struct closure *cl)
|
||||
stats.data <<= 9;
|
||||
stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
|
||||
memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
|
||||
blktrace_msg_all(c, "Finished gc");
|
||||
|
||||
trace_bcache_gc_end(c->sb.set_uuid);
|
||||
wake_up(&c->alloc_wait);
|
||||
trace_bcache_gc_end(c);
|
||||
|
||||
continue_at(cl, bch_moving_gc, bch_gc_wq);
|
||||
}
|
||||
@ -1654,14 +1624,14 @@ static bool fix_overlapping_extents(struct btree *b,
|
||||
struct btree_iter *iter,
|
||||
struct btree_op *op)
|
||||
{
|
||||
void subtract_dirty(struct bkey *k, int sectors)
|
||||
void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
|
||||
{
|
||||
struct bcache_device *d = b->c->devices[KEY_INODE(k)];
|
||||
|
||||
if (KEY_DIRTY(k) && d)
|
||||
atomic_long_sub(sectors, &d->sectors_dirty);
|
||||
if (KEY_DIRTY(k))
|
||||
bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
|
||||
offset, -sectors);
|
||||
}
|
||||
|
||||
uint64_t old_offset;
|
||||
unsigned old_size, sectors_found = 0;
|
||||
|
||||
while (1) {
|
||||
@ -1673,6 +1643,7 @@ static bool fix_overlapping_extents(struct btree *b,
|
||||
if (bkey_cmp(k, &START_KEY(insert)) <= 0)
|
||||
continue;
|
||||
|
||||
old_offset = KEY_START(k);
|
||||
old_size = KEY_SIZE(k);
|
||||
|
||||
/*
|
||||
@ -1728,7 +1699,7 @@ static bool fix_overlapping_extents(struct btree *b,
|
||||
|
||||
struct bkey *top;
|
||||
|
||||
subtract_dirty(k, KEY_SIZE(insert));
|
||||
subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
|
||||
|
||||
if (bkey_written(b, k)) {
|
||||
/*
|
||||
@ -1775,7 +1746,7 @@ static bool fix_overlapping_extents(struct btree *b,
|
||||
}
|
||||
}
|
||||
|
||||
subtract_dirty(k, old_size - KEY_SIZE(k));
|
||||
subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
|
||||
}
|
||||
|
||||
check_failed:
|
||||
@ -1798,7 +1769,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
|
||||
{
|
||||
struct bset *i = b->sets[b->nsets].data;
|
||||
struct bkey *m, *prev;
|
||||
const char *status = "insert";
|
||||
unsigned status = BTREE_INSERT_STATUS_INSERT;
|
||||
|
||||
BUG_ON(bkey_cmp(k, &b->key) > 0);
|
||||
BUG_ON(b->level && !KEY_PTRS(k));
|
||||
@ -1831,17 +1802,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
|
||||
goto insert;
|
||||
|
||||
/* prev is in the tree, if we merge we're done */
|
||||
status = "back merging";
|
||||
status = BTREE_INSERT_STATUS_BACK_MERGE;
|
||||
if (prev &&
|
||||
bch_bkey_try_merge(b, prev, k))
|
||||
goto merged;
|
||||
|
||||
status = "overwrote front";
|
||||
status = BTREE_INSERT_STATUS_OVERWROTE;
|
||||
if (m != end(i) &&
|
||||
KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
|
||||
goto copy;
|
||||
|
||||
status = "front merge";
|
||||
status = BTREE_INSERT_STATUS_FRONT_MERGE;
|
||||
if (m != end(i) &&
|
||||
bch_bkey_try_merge(b, k, m))
|
||||
goto copy;
|
||||
@ -1851,21 +1822,21 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
|
||||
insert: shift_keys(b, m, k);
|
||||
copy: bkey_copy(m, k);
|
||||
merged:
|
||||
bch_check_keys(b, "%s for %s at %s: %s", status,
|
||||
op_type(op), pbtree(b), pkey(k));
|
||||
bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status,
|
||||
op_type(op), pbtree(b), pkey(k));
|
||||
if (KEY_DIRTY(k))
|
||||
bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
|
||||
KEY_START(k), KEY_SIZE(k));
|
||||
|
||||
bch_check_keys(b, "%u for %s", status, op_type(op));
|
||||
|
||||
if (b->level && !KEY_OFFSET(k))
|
||||
b->prio_blocked++;
|
||||
btree_current_write(b)->prio_blocked++;
|
||||
|
||||
pr_debug("%s for %s at %s: %s", status,
|
||||
op_type(op), pbtree(b), pkey(k));
|
||||
trace_bcache_btree_insert_key(b, k, op->type, status);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
|
||||
static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
|
||||
{
|
||||
bool ret = false;
|
||||
struct bkey *k;
|
||||
@ -1896,7 +1867,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
|
||||
should_split(b))
|
||||
goto out;
|
||||
|
||||
op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio));
|
||||
op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
|
||||
|
||||
SET_KEY_PTRS(&op->replace, 1);
|
||||
get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
|
||||
@ -1907,7 +1878,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
|
||||
|
||||
BUG_ON(op->type != BTREE_INSERT);
|
||||
BUG_ON(!btree_insert_key(b, op, &tmp.k));
|
||||
bch_btree_write(b, false, NULL);
|
||||
ret = true;
|
||||
out:
|
||||
downgrade_write(&b->lock);
|
||||
@ -1929,12 +1899,11 @@ static int btree_split(struct btree *b, struct btree_op *op)
|
||||
|
||||
split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
|
||||
|
||||
pr_debug("%ssplitting at %s keys %i", split ? "" : "not ",
|
||||
pbtree(b), n1->sets[0].data->keys);
|
||||
|
||||
if (split) {
|
||||
unsigned keys = 0;
|
||||
|
||||
trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
|
||||
|
||||
n2 = bch_btree_node_alloc(b->c, b->level, &op->cl);
|
||||
if (IS_ERR(n2))
|
||||
goto err_free1;
|
||||
@ -1967,18 +1936,21 @@ static int btree_split(struct btree *b, struct btree_op *op)
|
||||
bkey_copy_key(&n2->key, &b->key);
|
||||
|
||||
bch_keylist_add(&op->keys, &n2->key);
|
||||
bch_btree_write(n2, true, op);
|
||||
bch_btree_node_write(n2, &op->cl);
|
||||
rw_unlock(true, n2);
|
||||
} else
|
||||
} else {
|
||||
trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
|
||||
|
||||
bch_btree_insert_keys(n1, op);
|
||||
}
|
||||
|
||||
bch_keylist_add(&op->keys, &n1->key);
|
||||
bch_btree_write(n1, true, op);
|
||||
bch_btree_node_write(n1, &op->cl);
|
||||
|
||||
if (n3) {
|
||||
bkey_copy_key(&n3->key, &MAX_KEY);
|
||||
bch_btree_insert_keys(n3, op);
|
||||
bch_btree_write(n3, true, op);
|
||||
bch_btree_node_write(n3, &op->cl);
|
||||
|
||||
closure_sync(&op->cl);
|
||||
bch_btree_set_root(n3);
|
||||
@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
|
||||
|
||||
BUG_ON(write_block(b) != b->sets[b->nsets].data);
|
||||
|
||||
if (bch_btree_insert_keys(b, op))
|
||||
bch_btree_write(b, false, op);
|
||||
if (bch_btree_insert_keys(b, op)) {
|
||||
if (!b->level)
|
||||
bch_btree_leaf_dirty(b, op);
|
||||
else
|
||||
bch_btree_node_write(b, &op->cl);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -2140,6 +2116,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
|
||||
void bch_btree_set_root(struct btree *b)
|
||||
{
|
||||
unsigned i;
|
||||
struct closure cl;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
trace_bcache_btree_set_root(b);
|
||||
|
||||
BUG_ON(!b->written);
|
||||
|
||||
@ -2153,8 +2134,8 @@ void bch_btree_set_root(struct btree *b)
|
||||
b->c->root = b;
|
||||
__bkey_put(b->c, &b->key);
|
||||
|
||||
bch_journal_meta(b->c, NULL);
|
||||
pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0));
|
||||
bch_journal_meta(b->c, &cl);
|
||||
closure_sync(&cl);
|
||||
}
|
||||
|
||||
/* Cache lookup */
|
||||
@ -2215,9 +2196,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
|
||||
KEY_OFFSET(k) - bio->bi_sector);
|
||||
|
||||
n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
|
||||
if (!n)
|
||||
return -EAGAIN;
|
||||
|
||||
if (n == bio)
|
||||
op->lookup_done = true;
|
||||
|
||||
@ -2240,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
|
||||
n->bi_end_io = bch_cache_read_endio;
|
||||
n->bi_private = &s->cl;
|
||||
|
||||
trace_bcache_cache_hit(n);
|
||||
__bch_submit_bbio(n, b->c);
|
||||
}
|
||||
|
||||
@ -2257,9 +2234,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
|
||||
struct btree_iter iter;
|
||||
bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));
|
||||
|
||||
pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode,
|
||||
(uint64_t) bio->bi_sector);
|
||||
|
||||
do {
|
||||
k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
|
||||
if (!k) {
|
||||
@ -2303,7 +2277,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
|
||||
}
|
||||
|
||||
static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
|
||||
struct keybuf *buf, struct bkey *end)
|
||||
struct keybuf *buf, struct bkey *end,
|
||||
keybuf_pred_fn *pred)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
bch_btree_iter_init(b, &iter, &buf->last_scanned);
|
||||
@ -2322,11 +2297,9 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
|
||||
if (bkey_cmp(&buf->last_scanned, end) >= 0)
|
||||
break;
|
||||
|
||||
if (buf->key_predicate(buf, k)) {
|
||||
if (pred(buf, k)) {
|
||||
struct keybuf_key *w;
|
||||
|
||||
pr_debug("%s", pkey(k));
|
||||
|
||||
spin_lock(&buf->lock);
|
||||
|
||||
w = array_alloc(&buf->freelist);
|
||||
@ -2343,7 +2316,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
|
||||
if (!k)
|
||||
break;
|
||||
|
||||
btree(refill_keybuf, k, b, op, buf, end);
|
||||
btree(refill_keybuf, k, b, op, buf, end, pred);
|
||||
/*
|
||||
* Might get an error here, but can't really do anything
|
||||
* and it'll get logged elsewhere. Just read what we
|
||||
@ -2361,7 +2334,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
|
||||
}
|
||||
|
||||
void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
|
||||
struct bkey *end)
|
||||
struct bkey *end, keybuf_pred_fn *pred)
|
||||
{
|
||||
struct bkey start = buf->last_scanned;
|
||||
struct btree_op op;
|
||||
@ -2369,7 +2342,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
|
||||
|
||||
cond_resched();
|
||||
|
||||
btree_root(refill_keybuf, c, &op, buf, end);
|
||||
btree_root(refill_keybuf, c, &op, buf, end, pred);
|
||||
closure_sync(&op.cl);
|
||||
|
||||
pr_debug("found %s keys from %llu:%llu to %llu:%llu",
|
||||
@ -2455,7 +2428,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
|
||||
|
||||
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
|
||||
struct keybuf *buf,
|
||||
struct bkey *end)
|
||||
struct bkey *end,
|
||||
keybuf_pred_fn *pred)
|
||||
{
|
||||
struct keybuf_key *ret;
|
||||
|
||||
@ -2469,15 +2443,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
|
||||
break;
|
||||
}
|
||||
|
||||
bch_refill_keybuf(c, buf, end);
|
||||
bch_refill_keybuf(c, buf, end, pred);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
|
||||
void bch_keybuf_init(struct keybuf *buf)
|
||||
{
|
||||
buf->key_predicate = fn;
|
||||
buf->last_scanned = MAX_KEY;
|
||||
buf->keys = RB_ROOT;
|
||||
|
||||
|
@ -102,7 +102,6 @@
|
||||
#include "debug.h"
|
||||
|
||||
struct btree_write {
|
||||
struct closure *owner;
|
||||
atomic_t *journal;
|
||||
|
||||
/* If btree_split() frees a btree node, it writes a new pointer to that
|
||||
@ -142,16 +141,12 @@ struct btree {
|
||||
*/
|
||||
struct bset_tree sets[MAX_BSETS];
|
||||
|
||||
/* Used to refcount bio splits, also protects b->bio */
|
||||
/* For outstanding btree writes, used as a lock - protects write_idx */
|
||||
struct closure_with_waitlist io;
|
||||
|
||||
/* Gets transferred to w->prio_blocked - see the comment there */
|
||||
int prio_blocked;
|
||||
|
||||
struct list_head list;
|
||||
struct delayed_work work;
|
||||
|
||||
uint64_t io_start_time;
|
||||
struct btree_write writes[2];
|
||||
struct bio *bio;
|
||||
};
|
||||
@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
|
||||
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
||||
|
||||
enum btree_flags {
|
||||
BTREE_NODE_read_done,
|
||||
BTREE_NODE_io_error,
|
||||
BTREE_NODE_dirty,
|
||||
BTREE_NODE_write_idx,
|
||||
};
|
||||
|
||||
BTREE_FLAG(read_done);
|
||||
BTREE_FLAG(io_error);
|
||||
BTREE_FLAG(dirty);
|
||||
BTREE_FLAG(write_idx);
|
||||
@ -278,6 +271,13 @@ struct btree_op {
|
||||
BKEY_PADDED(replace);
|
||||
};
|
||||
|
||||
enum {
|
||||
BTREE_INSERT_STATUS_INSERT,
|
||||
BTREE_INSERT_STATUS_BACK_MERGE,
|
||||
BTREE_INSERT_STATUS_OVERWROTE,
|
||||
BTREE_INSERT_STATUS_FRONT_MERGE,
|
||||
};
|
||||
|
||||
void bch_btree_op_init_stack(struct btree_op *);
|
||||
|
||||
static inline void rw_lock(bool w, struct btree *b, int level)
|
||||
@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b)
|
||||
#ifdef CONFIG_BCACHE_EDEBUG
|
||||
unsigned i;
|
||||
|
||||
if (w &&
|
||||
b->key.ptr[0] &&
|
||||
btree_node_read_done(b))
|
||||
if (w && b->key.ptr[0])
|
||||
for (i = 0; i <= b->nsets; i++)
|
||||
bch_check_key_order(b, b->sets[i].data);
|
||||
#endif
|
||||
@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b)
|
||||
> btree_blocks(b));
|
||||
}
|
||||
|
||||
void bch_btree_read_done(struct closure *);
|
||||
void bch_btree_read(struct btree *);
|
||||
void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
|
||||
void bch_btree_node_read(struct btree *);
|
||||
void bch_btree_node_write(struct btree *, struct closure *);
|
||||
|
||||
void bch_cannibalize_unlock(struct cache_set *, struct closure *);
|
||||
void bch_btree_set_root(struct btree *);
|
||||
@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
|
||||
struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
|
||||
int, struct btree_op *);
|
||||
|
||||
bool bch_btree_insert_keys(struct btree *, struct btree_op *);
|
||||
bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
|
||||
struct bio *);
|
||||
int bch_btree_insert(struct btree_op *, struct cache_set *);
|
||||
@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *);
|
||||
int bch_btree_check(struct cache_set *, struct btree_op *);
|
||||
uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
|
||||
|
||||
void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
|
||||
void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
|
||||
void bch_keybuf_init(struct keybuf *);
|
||||
void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
|
||||
keybuf_pred_fn *);
|
||||
bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
|
||||
struct bkey *);
|
||||
void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
|
||||
struct keybuf_key *bch_keybuf_next(struct keybuf *);
|
||||
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
|
||||
struct keybuf *, struct bkey *);
|
||||
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
|
||||
struct bkey *, keybuf_pred_fn *);
|
||||
|
||||
#endif
|
||||
|
@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
|
||||
} else {
|
||||
struct closure *parent = cl->parent;
|
||||
struct closure_waitlist *wait = closure_waitlist(cl);
|
||||
closure_fn *destructor = cl->fn;
|
||||
|
||||
closure_debug_destroy(cl);
|
||||
|
||||
smp_mb();
|
||||
atomic_set(&cl->remaining, -1);
|
||||
|
||||
if (wait)
|
||||
closure_wake_up(wait);
|
||||
|
||||
if (cl->fn)
|
||||
cl->fn(cl);
|
||||
if (destructor)
|
||||
destructor(cl);
|
||||
|
||||
if (parent)
|
||||
closure_put(parent);
|
||||
|
@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
|
||||
return "";
|
||||
}
|
||||
|
||||
struct keyprint_hack bch_pkey(const struct bkey *k)
|
||||
int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
|
||||
{
|
||||
unsigned i = 0;
|
||||
struct keyprint_hack r;
|
||||
char *out = r.s, *end = r.s + KEYHACK_SIZE;
|
||||
char *out = buf, *end = buf + size;
|
||||
|
||||
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
|
||||
|
||||
@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k)
|
||||
if (KEY_CSUM(k))
|
||||
p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
|
||||
#undef p
|
||||
return r;
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
struct keyprint_hack bch_pbtree(const struct btree *b)
|
||||
int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
|
||||
{
|
||||
struct keyprint_hack r;
|
||||
|
||||
snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0),
|
||||
b->level, b->c->root ? b->c->root->level : -1);
|
||||
return r;
|
||||
return scnprintf(buf, size, "%zu level %i/%i",
|
||||
PTR_BUCKET_NR(b->c, &b->key, 0),
|
||||
b->level, b->c->root ? b->c->root->level : -1);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
|
||||
@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i)
|
||||
{
|
||||
struct bkey *k;
|
||||
unsigned j;
|
||||
char buf[80];
|
||||
|
||||
for (k = i->start; k < end(i); k = bkey_next(k)) {
|
||||
bch_bkey_to_text(buf, sizeof(buf), k);
|
||||
printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
|
||||
(uint64_t *) k - i->d, i->keys, pkey(k));
|
||||
(uint64_t *) k - i->d, i->keys, buf);
|
||||
|
||||
for (j = 0; j < KEY_PTRS(k); j++) {
|
||||
size_t n = PTR_BUCKET_NR(b->c, k, j);
|
||||
@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
|
||||
v->written = 0;
|
||||
v->level = b->level;
|
||||
|
||||
bch_btree_read(v);
|
||||
bch_btree_node_read(v);
|
||||
closure_wait_event(&v->io.wait, &cl,
|
||||
atomic_read(&b->io.cl.remaining) == -1);
|
||||
|
||||
@ -200,7 +199,7 @@ void bch_data_verify(struct search *s)
|
||||
if (!check)
|
||||
return;
|
||||
|
||||
if (bch_bio_alloc_pages(check, GFP_NOIO))
|
||||
if (bio_alloc_pages(check, GFP_NOIO))
|
||||
goto out_put;
|
||||
|
||||
check->bi_rw = READ_SYNC;
|
||||
@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
|
||||
va_list args)
|
||||
{
|
||||
unsigned i;
|
||||
char buf[80];
|
||||
|
||||
console_lock();
|
||||
|
||||
@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
|
||||
|
||||
console_unlock();
|
||||
|
||||
panic("at %s\n", pbtree(b));
|
||||
bch_btree_to_text(buf, sizeof(buf), b);
|
||||
panic("at %s\n", buf);
|
||||
}
|
||||
|
||||
void bch_check_key_order_msg(struct btree *b, struct bset *i,
|
||||
@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
|
||||
{
|
||||
struct dump_iterator *i = file->private_data;
|
||||
ssize_t ret = 0;
|
||||
char kbuf[80];
|
||||
|
||||
while (size) {
|
||||
struct keybuf_key *w;
|
||||
@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
|
||||
if (i->bytes)
|
||||
break;
|
||||
|
||||
w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
|
||||
w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
|
||||
if (!w)
|
||||
break;
|
||||
|
||||
i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key));
|
||||
bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
|
||||
i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
|
||||
bch_keybuf_del(&i->keys, w);
|
||||
}
|
||||
|
||||
@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file)
|
||||
|
||||
file->private_data = i;
|
||||
i->c = c;
|
||||
bch_keybuf_init(&i->keys, dump_pred);
|
||||
bch_keybuf_init(&i->keys);
|
||||
i->keys.last_scanned = KEY(0, 0, 0);
|
||||
|
||||
return 0;
|
||||
@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c)
|
||||
|
||||
#endif
|
||||
|
||||
/* Fuzz tester has rotted: */
|
||||
#if 0
|
||||
|
||||
static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
|
||||
const char *buffer, size_t size)
|
||||
{
|
||||
void dump(struct btree *b)
|
||||
{
|
||||
struct bset *i;
|
||||
|
||||
for (i = b->sets[0].data;
|
||||
index(i, b) < btree_blocks(b) &&
|
||||
i->seq == b->sets[0].data->seq;
|
||||
i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
|
||||
dump_bset(b, i);
|
||||
}
|
||||
|
||||
struct cache_sb *sb;
|
||||
struct cache_set *c;
|
||||
struct btree *all[3], *b, *fill, *orig;
|
||||
int j;
|
||||
|
||||
struct btree_op op;
|
||||
bch_btree_op_init_stack(&op);
|
||||
|
||||
sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
|
||||
if (!sb)
|
||||
return -ENOMEM;
|
||||
|
||||
sb->bucket_size = 128;
|
||||
sb->block_size = 4;
|
||||
|
||||
c = bch_cache_set_alloc(sb);
|
||||
if (!c)
|
||||
return -ENOMEM;
|
||||
|
||||
for (j = 0; j < 3; j++) {
|
||||
BUG_ON(list_empty(&c->btree_cache));
|
||||
all[j] = list_first_entry(&c->btree_cache, struct btree, list);
|
||||
list_del_init(&all[j]->list);
|
||||
|
||||
all[j]->key = KEY(0, 0, c->sb.bucket_size);
|
||||
bkey_copy_key(&all[j]->key, &MAX_KEY);
|
||||
}
|
||||
|
||||
b = all[0];
|
||||
fill = all[1];
|
||||
orig = all[2];
|
||||
|
||||
while (1) {
|
||||
for (j = 0; j < 3; j++)
|
||||
all[j]->written = all[j]->nsets = 0;
|
||||
|
||||
bch_bset_init_next(b);
|
||||
|
||||
while (1) {
|
||||
struct bset *i = write_block(b);
|
||||
struct bkey *k = op.keys.top;
|
||||
unsigned rand;
|
||||
|
||||
bkey_init(k);
|
||||
rand = get_random_int();
|
||||
|
||||
op.type = rand & 1
|
||||
? BTREE_INSERT
|
||||
: BTREE_REPLACE;
|
||||
rand >>= 1;
|
||||
|
||||
SET_KEY_SIZE(k, bucket_remainder(c, rand));
|
||||
rand >>= c->bucket_bits;
|
||||
rand &= 1024 * 512 - 1;
|
||||
rand += c->sb.bucket_size;
|
||||
SET_KEY_OFFSET(k, rand);
|
||||
#if 0
|
||||
SET_KEY_PTRS(k, 1);
|
||||
#endif
|
||||
bch_keylist_push(&op.keys);
|
||||
bch_btree_insert_keys(b, &op);
|
||||
|
||||
if (should_split(b) ||
|
||||
set_blocks(i, b->c) !=
|
||||
__set_blocks(i, i->keys + 15, b->c)) {
|
||||
i->csum = csum_set(i);
|
||||
|
||||
memcpy(write_block(fill),
|
||||
i, set_bytes(i));
|
||||
|
||||
b->written += set_blocks(i, b->c);
|
||||
fill->written = b->written;
|
||||
if (b->written == btree_blocks(b))
|
||||
break;
|
||||
|
||||
bch_btree_sort_lazy(b);
|
||||
bch_bset_init_next(b);
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(orig->sets[0].data,
|
||||
fill->sets[0].data,
|
||||
btree_bytes(c));
|
||||
|
||||
bch_btree_sort(b);
|
||||
fill->written = 0;
|
||||
bch_btree_read_done(&fill->io.cl);
|
||||
|
||||
if (b->sets[0].data->keys != fill->sets[0].data->keys ||
|
||||
memcmp(b->sets[0].data->start,
|
||||
fill->sets[0].data->start,
|
||||
b->sets[0].data->keys * sizeof(uint64_t))) {
|
||||
struct bset *i = b->sets[0].data;
|
||||
struct bkey *k, *l;
|
||||
|
||||
for (k = i->start,
|
||||
l = fill->sets[0].data->start;
|
||||
k < end(i);
|
||||
k = bkey_next(k), l = bkey_next(l))
|
||||
if (bkey_cmp(k, l) ||
|
||||
KEY_SIZE(k) != KEY_SIZE(l))
|
||||
pr_err("key %zi differs: %s != %s",
|
||||
(uint64_t *) k - i->d,
|
||||
pkey(k), pkey(l));
|
||||
|
||||
for (j = 0; j < 3; j++) {
|
||||
pr_err("**** Set %i ****", j);
|
||||
dump(all[j]);
|
||||
}
|
||||
panic("\n");
|
||||
}
|
||||
|
||||
pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
|
||||
}
|
||||
}
|
||||
|
||||
kobj_attribute_write(fuzz, btree_fuzz);
|
||||
#endif
|
||||
|
||||
void bch_debug_exit(void)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(debug))
|
||||
@ -554,11 +421,6 @@ void bch_debug_exit(void)
|
||||
int __init bch_debug_init(struct kobject *kobj)
|
||||
{
|
||||
int ret = 0;
|
||||
#if 0
|
||||
ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
|
||||
if (ret)
|
||||
return ret;
|
||||
#endif
|
||||
|
||||
debug = debugfs_create_dir("bcache", NULL);
|
||||
return ret;
|
||||
|
@ -3,15 +3,8 @@
|
||||
|
||||
/* Btree/bkey debug printing */
|
||||
|
||||
#define KEYHACK_SIZE 80
|
||||
struct keyprint_hack {
|
||||
char s[KEYHACK_SIZE];
|
||||
};
|
||||
|
||||
struct keyprint_hack bch_pkey(const struct bkey *k);
|
||||
struct keyprint_hack bch_pbtree(const struct btree *b);
|
||||
#define pkey(k) (&bch_pkey(k).s[0])
|
||||
#define pbtree(b) (&bch_pbtree(b).s[0])
|
||||
int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
|
||||
int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
|
||||
|
||||
#ifdef CONFIG_BCACHE_EDEBUG
|
||||
|
||||
|
@ -9,6 +9,8 @@
|
||||
#include "bset.h"
|
||||
#include "debug.h"
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
|
||||
static void bch_bi_idx_hack_endio(struct bio *bio, int error)
|
||||
{
|
||||
struct bio *p = bio->bi_private;
|
||||
@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio)
|
||||
* The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
|
||||
* bvec boundry; it is the caller's responsibility to ensure that @bio is not
|
||||
* freed before the split.
|
||||
*
|
||||
* If bch_bio_split() is running under generic_make_request(), it's not safe to
|
||||
* allocate more than one bio from the same bio set. Therefore, if it is running
|
||||
* under generic_make_request() it masks out __GFP_WAIT when doing the
|
||||
* allocation. The caller must check for failure if there's any possibility of
|
||||
* it being called from under generic_make_request(); it is then the caller's
|
||||
* responsibility to retry from a safe context (by e.g. punting to workqueue).
|
||||
*/
|
||||
struct bio *bch_bio_split(struct bio *bio, int sectors,
|
||||
gfp_t gfp, struct bio_set *bs)
|
||||
@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
|
||||
|
||||
BUG_ON(sectors <= 0);
|
||||
|
||||
/*
|
||||
* If we're being called from underneath generic_make_request() and we
|
||||
* already allocated any bios from this bio set, we risk deadlock if we
|
||||
* use the mempool. So instead, we possibly fail and let the caller punt
|
||||
* to workqueue or somesuch and retry in a safe context.
|
||||
*/
|
||||
if (current->bio_list)
|
||||
gfp &= ~__GFP_WAIT;
|
||||
|
||||
if (sectors >= bio_sectors(bio))
|
||||
return bio;
|
||||
|
||||
if (bio->bi_rw & REQ_DISCARD) {
|
||||
ret = bio_alloc_bioset(gfp, 1, bs);
|
||||
if (!ret)
|
||||
return NULL;
|
||||
idx = 0;
|
||||
goto out;
|
||||
}
|
||||
@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
|
||||
queue_max_segments(q));
|
||||
struct bio_vec *bv, *end = bio_iovec(bio) +
|
||||
min_t(int, bio_segments(bio), max_segments);
|
||||
|
||||
if (bio->bi_rw & REQ_DISCARD)
|
||||
return min(ret, q->limits.max_discard_sectors);
|
||||
|
||||
if (bio_segments(bio) > max_segments ||
|
||||
q->merge_bvec_fn) {
|
||||
struct bio_vec *bv;
|
||||
int i, seg = 0;
|
||||
|
||||
ret = 0;
|
||||
|
||||
for (bv = bio_iovec(bio); bv < end; bv++) {
|
||||
bio_for_each_segment(bv, bio, i) {
|
||||
struct bvec_merge_data bvm = {
|
||||
.bi_bdev = bio->bi_bdev,
|
||||
.bi_sector = bio->bi_sector,
|
||||
@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
|
||||
.bi_rw = bio->bi_rw,
|
||||
};
|
||||
|
||||
if (seg == max_segments)
|
||||
break;
|
||||
|
||||
if (q->merge_bvec_fn &&
|
||||
q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
|
||||
break;
|
||||
|
||||
seg++;
|
||||
ret += bv->bv_len >> 9;
|
||||
}
|
||||
}
|
||||
@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error)
|
||||
closure_put(cl);
|
||||
}
|
||||
|
||||
static void __bch_bio_submit_split(struct closure *cl)
|
||||
{
|
||||
struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
|
||||
struct bio *bio = s->bio, *n;
|
||||
|
||||
do {
|
||||
n = bch_bio_split(bio, bch_bio_max_sectors(bio),
|
||||
GFP_NOIO, s->p->bio_split);
|
||||
if (!n)
|
||||
continue_at(cl, __bch_bio_submit_split, system_wq);
|
||||
|
||||
n->bi_end_io = bch_bio_submit_split_endio;
|
||||
n->bi_private = cl;
|
||||
|
||||
closure_get(cl);
|
||||
bch_generic_make_request_hack(n);
|
||||
} while (n != bio);
|
||||
|
||||
continue_at(cl, bch_bio_submit_split_done, NULL);
|
||||
}
|
||||
|
||||
void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
|
||||
{
|
||||
struct bio_split_hook *s;
|
||||
struct bio *n;
|
||||
|
||||
if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
|
||||
goto submit;
|
||||
@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
|
||||
goto submit;
|
||||
|
||||
s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
|
||||
closure_init(&s->cl, NULL);
|
||||
|
||||
s->bio = bio;
|
||||
s->p = p;
|
||||
@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
|
||||
s->bi_private = bio->bi_private;
|
||||
bio_get(bio);
|
||||
|
||||
closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
|
||||
return;
|
||||
do {
|
||||
n = bch_bio_split(bio, bch_bio_max_sectors(bio),
|
||||
GFP_NOIO, s->p->bio_split);
|
||||
|
||||
n->bi_end_io = bch_bio_submit_split_endio;
|
||||
n->bi_private = &s->cl;
|
||||
|
||||
closure_get(&s->cl);
|
||||
bch_generic_make_request_hack(n);
|
||||
} while (n != bio);
|
||||
|
||||
continue_at(&s->cl, bch_bio_submit_split_done, NULL);
|
||||
submit:
|
||||
bch_generic_make_request_hack(bio);
|
||||
}
|
||||
|
@ -9,6 +9,8 @@
|
||||
#include "debug.h"
|
||||
#include "request.h"
|
||||
|
||||
#include <trace/events/bcache.h>
|
||||
|
||||
/*
|
||||
* Journal replay/recovery:
|
||||
*
|
||||
@ -182,9 +184,14 @@ bsearch:
|
||||
pr_debug("starting binary search, l %u r %u", l, r);
|
||||
|
||||
while (l + 1 < r) {
|
||||
m = (l + r) >> 1;
|
||||
seq = list_entry(list->prev, struct journal_replay,
|
||||
list)->j.seq;
|
||||
|
||||
if (read_bucket(m))
|
||||
m = (l + r) >> 1;
|
||||
read_bucket(m);
|
||||
|
||||
if (seq != list_entry(list->prev, struct journal_replay,
|
||||
list)->j.seq)
|
||||
l = m;
|
||||
else
|
||||
r = m;
|
||||
@ -300,7 +307,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
|
||||
for (k = i->j.start;
|
||||
k < end(&i->j);
|
||||
k = bkey_next(k)) {
|
||||
pr_debug("%s", pkey(k));
|
||||
trace_bcache_journal_replay_key(k);
|
||||
|
||||
bkey_copy(op->keys.top, k);
|
||||
bch_keylist_push(&op->keys);
|
||||
|
||||
@ -384,7 +392,7 @@ out:
|
||||
return;
|
||||
found:
|
||||
if (btree_node_dirty(best))
|
||||
bch_btree_write(best, true, NULL);
|
||||
bch_btree_node_write(best, NULL);
|
||||
rw_unlock(true, best);
|
||||
}
|
||||
|
||||
@ -617,7 +625,7 @@ static void journal_write_unlocked(struct closure *cl)
|
||||
bio_reset(bio);
|
||||
bio->bi_sector = PTR_OFFSET(k, i);
|
||||
bio->bi_bdev = ca->bdev;
|
||||
bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
|
||||
bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
|
||||
bio->bi_size = sectors << 9;
|
||||
|
||||
bio->bi_end_io = journal_write_endio;
|
||||
@ -712,7 +720,8 @@ void bch_journal(struct closure *cl)
|
||||
spin_lock(&c->journal.lock);
|
||||
|
||||
if (journal_full(&c->journal)) {
|
||||
/* XXX: tracepoint */
|
||||
trace_bcache_journal_full(c);
|
||||
|
||||
closure_wait(&c->journal.wait, cl);
|
||||
|
||||
journal_reclaim(c);
|
||||
@ -728,13 +737,15 @@ void bch_journal(struct closure *cl)
|
||||
|
||||
if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
|
||||
b > c->journal.blocks_free) {
|
||||
/* XXX: If we were inserting so many keys that they won't fit in
|
||||
trace_bcache_journal_entry_full(c);
|
||||
|
||||
/*
|
||||
* XXX: If we were inserting so many keys that they won't fit in
|
||||
* an _empty_ journal write, we'll deadlock. For now, handle
|
||||
* this in bch_keylist_realloc() - but something to think about.
|
||||
*/
|
||||
BUG_ON(!w->data->keys);
|
||||
|
||||
/* XXX: tracepoint */
|
||||
BUG_ON(!closure_wait(&w->wait, cl));
|
||||
|
||||
closure_flush(&c->journal.io);
|
||||
|
@ -9,6 +9,8 @@
|
||||
#include "debug.h"
|
||||
#include "request.h"
|
||||
|
||||
#include <trace/events/bcache.h>
|
||||
|
||||
struct moving_io {
|
||||
struct keybuf_key *w;
|
||||
struct search s;
|
||||
@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, s.cl);
|
||||
struct bio *bio = &io->bio.bio;
|
||||
struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
|
||||
struct bio_vec *bv;
|
||||
int i;
|
||||
|
||||
while (bv-- != bio->bi_io_vec)
|
||||
bio_for_each_segment_all(bv, bio, i)
|
||||
__free_page(bv->bv_page);
|
||||
|
||||
pr_debug("%s %s", io->s.op.insert_collision
|
||||
? "collision moving" : "moved",
|
||||
pkey(&io->w->key));
|
||||
if (io->s.op.insert_collision)
|
||||
trace_bcache_gc_copy_collision(&io->w->key);
|
||||
|
||||
bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
|
||||
|
||||
@ -94,8 +96,6 @@ static void write_moving(struct closure *cl)
|
||||
struct moving_io *io = container_of(s, struct moving_io, s);
|
||||
|
||||
if (!s->error) {
|
||||
trace_bcache_write_moving(&io->bio.bio);
|
||||
|
||||
moving_init(io);
|
||||
|
||||
io->bio.bio.bi_sector = KEY_START(&io->w->key);
|
||||
@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl)
|
||||
struct moving_io *io = container_of(s, struct moving_io, s);
|
||||
struct bio *bio = &io->bio.bio;
|
||||
|
||||
trace_bcache_read_moving(bio);
|
||||
bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
|
||||
|
||||
continue_at(cl, write_moving, bch_gc_wq);
|
||||
@ -138,7 +137,8 @@ static void read_moving(struct closure *cl)
|
||||
/* XXX: if we error, background writeback could stall indefinitely */
|
||||
|
||||
while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
|
||||
w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
|
||||
w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
|
||||
&MAX_KEY, moving_pred);
|
||||
if (!w)
|
||||
break;
|
||||
|
||||
@ -159,10 +159,10 @@ static void read_moving(struct closure *cl)
|
||||
bio->bi_rw = READ;
|
||||
bio->bi_end_io = read_moving_endio;
|
||||
|
||||
if (bch_bio_alloc_pages(bio, GFP_KERNEL))
|
||||
if (bio_alloc_pages(bio, GFP_KERNEL))
|
||||
goto err;
|
||||
|
||||
pr_debug("%s", pkey(&w->key));
|
||||
trace_bcache_gc_copy(&w->key);
|
||||
|
||||
closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
|
||||
|
||||
@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl)
|
||||
|
||||
void bch_moving_init_cache_set(struct cache_set *c)
|
||||
{
|
||||
bch_keybuf_init(&c->moving_gc_keys, moving_pred);
|
||||
bch_keybuf_init(&c->moving_gc_keys);
|
||||
}
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "btree.h"
|
||||
#include "debug.h"
|
||||
#include "request.h"
|
||||
#include "writeback.h"
|
||||
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/module.h>
|
||||
@ -21,8 +22,6 @@
|
||||
|
||||
#define CUTOFF_CACHE_ADD 95
|
||||
#define CUTOFF_CACHE_READA 90
|
||||
#define CUTOFF_WRITEBACK 50
|
||||
#define CUTOFF_WRITEBACK_SYNC 75
|
||||
|
||||
struct kmem_cache *bch_search_cache;
|
||||
|
||||
@ -489,6 +488,12 @@ static void bch_insert_data_loop(struct closure *cl)
|
||||
bch_queue_gc(op->c);
|
||||
}
|
||||
|
||||
/*
|
||||
* Journal writes are marked REQ_FLUSH; if the original write was a
|
||||
* flush, it'll wait on the journal write.
|
||||
*/
|
||||
bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
|
||||
|
||||
do {
|
||||
unsigned i;
|
||||
struct bkey *k;
|
||||
@ -510,10 +515,6 @@ static void bch_insert_data_loop(struct closure *cl)
|
||||
goto err;
|
||||
|
||||
n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split);
|
||||
if (!n) {
|
||||
__bkey_put(op->c, k);
|
||||
continue_at(cl, bch_insert_data_loop, bcache_wq);
|
||||
}
|
||||
|
||||
n->bi_end_io = bch_insert_data_endio;
|
||||
n->bi_private = cl;
|
||||
@ -530,10 +531,9 @@ static void bch_insert_data_loop(struct closure *cl)
|
||||
if (KEY_CSUM(k))
|
||||
bio_csum(n, k);
|
||||
|
||||
pr_debug("%s", pkey(k));
|
||||
trace_bcache_cache_insert(k);
|
||||
bch_keylist_push(&op->keys);
|
||||
|
||||
trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev);
|
||||
n->bi_rw |= REQ_WRITE;
|
||||
bch_submit_bbio(n, op->c, k, 0);
|
||||
} while (n != bio);
|
||||
@ -716,7 +716,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
|
||||
s->task = current;
|
||||
s->orig_bio = bio;
|
||||
s->write = (bio->bi_rw & REQ_WRITE) != 0;
|
||||
s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0;
|
||||
s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
|
||||
s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0;
|
||||
s->recoverable = 1;
|
||||
s->start_time = jiffies;
|
||||
@ -784,11 +784,8 @@ static void request_read_error(struct closure *cl)
|
||||
int i;
|
||||
|
||||
if (s->recoverable) {
|
||||
/* The cache read failed, but we can retry from the backing
|
||||
* device.
|
||||
*/
|
||||
pr_debug("recovering at sector %llu",
|
||||
(uint64_t) s->orig_bio->bi_sector);
|
||||
/* Retry from the backing device: */
|
||||
trace_bcache_read_retry(s->orig_bio);
|
||||
|
||||
s->error = 0;
|
||||
bv = s->bio.bio.bi_io_vec;
|
||||
@ -806,7 +803,6 @@ static void request_read_error(struct closure *cl)
|
||||
|
||||
/* XXX: invalidate cache */
|
||||
|
||||
trace_bcache_read_retry(&s->bio.bio);
|
||||
closure_bio_submit(&s->bio.bio, &s->cl, s->d);
|
||||
}
|
||||
|
||||
@ -827,53 +823,13 @@ static void request_read_done(struct closure *cl)
|
||||
*/
|
||||
|
||||
if (s->op.cache_bio) {
|
||||
struct bio_vec *src, *dst;
|
||||
unsigned src_offset, dst_offset, bytes;
|
||||
void *dst_ptr;
|
||||
|
||||
bio_reset(s->op.cache_bio);
|
||||
s->op.cache_bio->bi_sector = s->cache_miss->bi_sector;
|
||||
s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev;
|
||||
s->op.cache_bio->bi_size = s->cache_bio_sectors << 9;
|
||||
bch_bio_map(s->op.cache_bio, NULL);
|
||||
|
||||
src = bio_iovec(s->op.cache_bio);
|
||||
dst = bio_iovec(s->cache_miss);
|
||||
src_offset = src->bv_offset;
|
||||
dst_offset = dst->bv_offset;
|
||||
dst_ptr = kmap(dst->bv_page);
|
||||
|
||||
while (1) {
|
||||
if (dst_offset == dst->bv_offset + dst->bv_len) {
|
||||
kunmap(dst->bv_page);
|
||||
dst++;
|
||||
if (dst == bio_iovec_idx(s->cache_miss,
|
||||
s->cache_miss->bi_vcnt))
|
||||
break;
|
||||
|
||||
dst_offset = dst->bv_offset;
|
||||
dst_ptr = kmap(dst->bv_page);
|
||||
}
|
||||
|
||||
if (src_offset == src->bv_offset + src->bv_len) {
|
||||
src++;
|
||||
if (src == bio_iovec_idx(s->op.cache_bio,
|
||||
s->op.cache_bio->bi_vcnt))
|
||||
BUG();
|
||||
|
||||
src_offset = src->bv_offset;
|
||||
}
|
||||
|
||||
bytes = min(dst->bv_offset + dst->bv_len - dst_offset,
|
||||
src->bv_offset + src->bv_len - src_offset);
|
||||
|
||||
memcpy(dst_ptr + dst_offset,
|
||||
page_address(src->bv_page) + src_offset,
|
||||
bytes);
|
||||
|
||||
src_offset += bytes;
|
||||
dst_offset += bytes;
|
||||
}
|
||||
bio_copy_data(s->cache_miss, s->op.cache_bio);
|
||||
|
||||
bio_put(s->cache_miss);
|
||||
s->cache_miss = NULL;
|
||||
@ -899,6 +855,7 @@ static void request_read_done_bh(struct closure *cl)
|
||||
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
|
||||
|
||||
bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip);
|
||||
trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip);
|
||||
|
||||
if (s->error)
|
||||
continue_at_nobarrier(cl, request_read_error, bcache_wq);
|
||||
@ -917,9 +874,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
|
||||
struct bio *miss;
|
||||
|
||||
miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
|
||||
if (!miss)
|
||||
return -EAGAIN;
|
||||
|
||||
if (miss == bio)
|
||||
s->op.lookup_done = true;
|
||||
|
||||
@ -938,8 +892,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
|
||||
reada = min(dc->readahead >> 9,
|
||||
sectors - bio_sectors(miss));
|
||||
|
||||
if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev))
|
||||
reada = bdev_sectors(miss->bi_bdev) - bio_end(miss);
|
||||
if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
|
||||
reada = bdev_sectors(miss->bi_bdev) -
|
||||
bio_end_sector(miss);
|
||||
}
|
||||
|
||||
s->cache_bio_sectors = bio_sectors(miss) + reada;
|
||||
@ -963,13 +918,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
|
||||
goto out_put;
|
||||
|
||||
bch_bio_map(s->op.cache_bio, NULL);
|
||||
if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
|
||||
if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
|
||||
goto out_put;
|
||||
|
||||
s->cache_miss = miss;
|
||||
bio_get(s->op.cache_bio);
|
||||
|
||||
trace_bcache_cache_miss(s->orig_bio);
|
||||
closure_bio_submit(s->op.cache_bio, &s->cl, s->d);
|
||||
|
||||
return ret;
|
||||
@ -1002,24 +956,13 @@ static void cached_dev_write_complete(struct closure *cl)
|
||||
cached_dev_bio_complete(cl);
|
||||
}
|
||||
|
||||
static bool should_writeback(struct cached_dev *dc, struct bio *bio)
|
||||
{
|
||||
unsigned threshold = (bio->bi_rw & REQ_SYNC)
|
||||
? CUTOFF_WRITEBACK_SYNC
|
||||
: CUTOFF_WRITEBACK;
|
||||
|
||||
return !atomic_read(&dc->disk.detaching) &&
|
||||
cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
|
||||
dc->disk.c->gc_stats.in_use < threshold;
|
||||
}
|
||||
|
||||
static void request_write(struct cached_dev *dc, struct search *s)
|
||||
{
|
||||
struct closure *cl = &s->cl;
|
||||
struct bio *bio = &s->bio.bio;
|
||||
struct bkey start, end;
|
||||
start = KEY(dc->disk.id, bio->bi_sector, 0);
|
||||
end = KEY(dc->disk.id, bio_end(bio), 0);
|
||||
end = KEY(dc->disk.id, bio_end_sector(bio), 0);
|
||||
|
||||
bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end);
|
||||
|
||||
@ -1034,22 +977,37 @@ static void request_write(struct cached_dev *dc, struct search *s)
|
||||
if (bio->bi_rw & REQ_DISCARD)
|
||||
goto skip;
|
||||
|
||||
if (should_writeback(dc, s->orig_bio,
|
||||
cache_mode(dc, bio),
|
||||
s->op.skip)) {
|
||||
s->op.skip = false;
|
||||
s->writeback = true;
|
||||
}
|
||||
|
||||
if (s->op.skip)
|
||||
goto skip;
|
||||
|
||||
if (should_writeback(dc, s->orig_bio))
|
||||
s->writeback = true;
|
||||
trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
|
||||
|
||||
if (!s->writeback) {
|
||||
s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
|
||||
dc->disk.bio_split);
|
||||
|
||||
trace_bcache_writethrough(s->orig_bio);
|
||||
closure_bio_submit(bio, cl, s->d);
|
||||
} else {
|
||||
s->op.cache_bio = bio;
|
||||
trace_bcache_writeback(s->orig_bio);
|
||||
bch_writeback_add(dc, bio_sectors(bio));
|
||||
bch_writeback_add(dc);
|
||||
|
||||
if (s->op.flush_journal) {
|
||||
/* Also need to send a flush to the backing device */
|
||||
s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
|
||||
dc->disk.bio_split);
|
||||
|
||||
bio->bi_size = 0;
|
||||
bio->bi_vcnt = 0;
|
||||
closure_bio_submit(bio, cl, s->d);
|
||||
} else {
|
||||
s->op.cache_bio = bio;
|
||||
}
|
||||
}
|
||||
out:
|
||||
closure_call(&s->op.cl, bch_insert_data, NULL, cl);
|
||||
@ -1058,7 +1016,6 @@ skip:
|
||||
s->op.skip = true;
|
||||
s->op.cache_bio = s->orig_bio;
|
||||
bio_get(s->op.cache_bio);
|
||||
trace_bcache_write_skip(s->orig_bio);
|
||||
|
||||
if ((bio->bi_rw & REQ_DISCARD) &&
|
||||
!blk_queue_discard(bdev_get_queue(dc->bdev)))
|
||||
@ -1088,9 +1045,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s)
|
||||
|
||||
/* Cached devices - read & write stuff */
|
||||
|
||||
int bch_get_congested(struct cache_set *c)
|
||||
unsigned bch_get_congested(struct cache_set *c)
|
||||
{
|
||||
int i;
|
||||
long rand;
|
||||
|
||||
if (!c->congested_read_threshold_us &&
|
||||
!c->congested_write_threshold_us)
|
||||
@ -1106,7 +1064,13 @@ int bch_get_congested(struct cache_set *c)
|
||||
|
||||
i += CONGESTED_MAX;
|
||||
|
||||
return i <= 0 ? 1 : fract_exp_two(i, 6);
|
||||
if (i > 0)
|
||||
i = fract_exp_two(i, 6);
|
||||
|
||||
rand = get_random_int();
|
||||
i -= bitmap_weight(&rand, BITS_PER_LONG);
|
||||
|
||||
return i > 0 ? i : 1;
|
||||
}
|
||||
|
||||
static void add_sequential(struct task_struct *t)
|
||||
@ -1126,10 +1090,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
|
||||
{
|
||||
struct cache_set *c = s->op.c;
|
||||
struct bio *bio = &s->bio.bio;
|
||||
|
||||
long rand;
|
||||
int cutoff = bch_get_congested(c);
|
||||
unsigned mode = cache_mode(dc, bio);
|
||||
unsigned sectors, congested = bch_get_congested(c);
|
||||
|
||||
if (atomic_read(&dc->disk.detaching) ||
|
||||
c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
|
||||
@ -1147,17 +1109,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s)
|
||||
goto skip;
|
||||
}
|
||||
|
||||
if (!cutoff) {
|
||||
cutoff = dc->sequential_cutoff >> 9;
|
||||
if (!congested && !dc->sequential_cutoff)
|
||||
goto rescale;
|
||||
|
||||
if (!cutoff)
|
||||
goto rescale;
|
||||
|
||||
if (mode == CACHE_MODE_WRITEBACK &&
|
||||
(bio->bi_rw & REQ_WRITE) &&
|
||||
(bio->bi_rw & REQ_SYNC))
|
||||
goto rescale;
|
||||
}
|
||||
if (!congested &&
|
||||
mode == CACHE_MODE_WRITEBACK &&
|
||||
(bio->bi_rw & REQ_WRITE) &&
|
||||
(bio->bi_rw & REQ_SYNC))
|
||||
goto rescale;
|
||||
|
||||
if (dc->sequential_merge) {
|
||||
struct io *i;
|
||||
@ -1177,7 +1136,7 @@ found:
|
||||
if (i->sequential + bio->bi_size > i->sequential)
|
||||
i->sequential += bio->bi_size;
|
||||
|
||||
i->last = bio_end(bio);
|
||||
i->last = bio_end_sector(bio);
|
||||
i->jiffies = jiffies + msecs_to_jiffies(5000);
|
||||
s->task->sequential_io = i->sequential;
|
||||
|
||||
@ -1192,12 +1151,19 @@ found:
|
||||
add_sequential(s->task);
|
||||
}
|
||||
|
||||
rand = get_random_int();
|
||||
cutoff -= bitmap_weight(&rand, BITS_PER_LONG);
|
||||
sectors = max(s->task->sequential_io,
|
||||
s->task->sequential_io_avg) >> 9;
|
||||
|
||||
if (cutoff <= (int) (max(s->task->sequential_io,
|
||||
s->task->sequential_io_avg) >> 9))
|
||||
if (dc->sequential_cutoff &&
|
||||
sectors >= dc->sequential_cutoff >> 9) {
|
||||
trace_bcache_bypass_sequential(s->orig_bio);
|
||||
goto skip;
|
||||
}
|
||||
|
||||
if (congested && sectors >= congested) {
|
||||
trace_bcache_bypass_congested(s->orig_bio);
|
||||
goto skip;
|
||||
}
|
||||
|
||||
rescale:
|
||||
bch_rescale_priorities(c, bio_sectors(bio));
|
||||
@ -1288,30 +1254,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
|
||||
static int flash_dev_cache_miss(struct btree *b, struct search *s,
|
||||
struct bio *bio, unsigned sectors)
|
||||
{
|
||||
struct bio_vec *bv;
|
||||
int i;
|
||||
|
||||
/* Zero fill bio */
|
||||
|
||||
while (bio->bi_idx != bio->bi_vcnt) {
|
||||
struct bio_vec *bv = bio_iovec(bio);
|
||||
bio_for_each_segment(bv, bio, i) {
|
||||
unsigned j = min(bv->bv_len >> 9, sectors);
|
||||
|
||||
void *p = kmap(bv->bv_page);
|
||||
memset(p + bv->bv_offset, 0, j << 9);
|
||||
kunmap(bv->bv_page);
|
||||
|
||||
bv->bv_len -= j << 9;
|
||||
bv->bv_offset += j << 9;
|
||||
|
||||
if (bv->bv_len)
|
||||
return 0;
|
||||
|
||||
bio->bi_sector += j;
|
||||
bio->bi_size -= j << 9;
|
||||
|
||||
bio->bi_idx++;
|
||||
sectors -= j;
|
||||
sectors -= j;
|
||||
}
|
||||
|
||||
s->op.lookup_done = true;
|
||||
bio_advance(bio, min(sectors << 9, bio->bi_size));
|
||||
|
||||
if (!bio->bi_size)
|
||||
s->op.lookup_done = true;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -1338,8 +1299,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
|
||||
closure_call(&s->op.cl, btree_read_async, NULL, cl);
|
||||
} else if (bio_has_data(bio) || s->op.skip) {
|
||||
bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys,
|
||||
&KEY(d->id, bio->bi_sector, 0),
|
||||
&KEY(d->id, bio_end(bio), 0));
|
||||
&KEY(d->id, bio->bi_sector, 0),
|
||||
&KEY(d->id, bio_end_sector(bio), 0));
|
||||
|
||||
s->writeback = true;
|
||||
s->op.cache_bio = bio;
|
||||
|
@ -30,7 +30,7 @@ struct search {
|
||||
};
|
||||
|
||||
void bch_cache_read_endio(struct bio *, int);
|
||||
int bch_get_congested(struct cache_set *);
|
||||
unsigned bch_get_congested(struct cache_set *);
|
||||
void bch_insert_data(struct closure *cl);
|
||||
void bch_btree_insert_async(struct closure *);
|
||||
void bch_cache_read_endio(struct bio *, int);
|
||||
|
@ -10,10 +10,13 @@
|
||||
#include "btree.h"
|
||||
#include "debug.h"
|
||||
#include "request.h"
|
||||
#include "writeback.h"
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/genhd.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/reboot.h>
|
||||
@ -342,6 +345,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
|
||||
struct closure *cl = &c->uuid_write.cl;
|
||||
struct uuid_entry *u;
|
||||
unsigned i;
|
||||
char buf[80];
|
||||
|
||||
BUG_ON(!parent);
|
||||
closure_lock(&c->uuid_write, parent);
|
||||
@ -362,8 +366,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw,
|
||||
break;
|
||||
}
|
||||
|
||||
pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
|
||||
pkey(&c->uuid_bucket));
|
||||
bch_bkey_to_text(buf, sizeof(buf), k);
|
||||
pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);
|
||||
|
||||
for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
|
||||
if (!bch_is_zero(u->uuid, 16))
|
||||
@ -543,7 +547,6 @@ void bch_prio_write(struct cache *ca)
|
||||
|
||||
pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
|
||||
fifo_used(&ca->free_inc), fifo_used(&ca->unused));
|
||||
blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
|
||||
|
||||
for (i = prio_buckets(ca) - 1; i >= 0; --i) {
|
||||
long bucket;
|
||||
@ -704,7 +707,8 @@ static void bcache_device_detach(struct bcache_device *d)
|
||||
atomic_set(&d->detaching, 0);
|
||||
}
|
||||
|
||||
bcache_device_unlink(d);
|
||||
if (!d->flush_done)
|
||||
bcache_device_unlink(d);
|
||||
|
||||
d->c->devices[d->id] = NULL;
|
||||
closure_put(&d->c->caching);
|
||||
@ -743,13 +747,35 @@ static void bcache_device_free(struct bcache_device *d)
|
||||
mempool_destroy(d->unaligned_bvec);
|
||||
if (d->bio_split)
|
||||
bioset_free(d->bio_split);
|
||||
if (is_vmalloc_addr(d->stripe_sectors_dirty))
|
||||
vfree(d->stripe_sectors_dirty);
|
||||
else
|
||||
kfree(d->stripe_sectors_dirty);
|
||||
|
||||
closure_debug_destroy(&d->cl);
|
||||
}
|
||||
|
||||
static int bcache_device_init(struct bcache_device *d, unsigned block_size)
|
||||
static int bcache_device_init(struct bcache_device *d, unsigned block_size,
|
||||
sector_t sectors)
|
||||
{
|
||||
struct request_queue *q;
|
||||
size_t n;
|
||||
|
||||
if (!d->stripe_size_bits)
|
||||
d->stripe_size_bits = 31;
|
||||
|
||||
d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
|
||||
d->stripe_size_bits;
|
||||
|
||||
if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
|
||||
return -ENOMEM;
|
||||
|
||||
n = d->nr_stripes * sizeof(atomic_t);
|
||||
d->stripe_sectors_dirty = n < PAGE_SIZE << 6
|
||||
? kzalloc(n, GFP_KERNEL)
|
||||
: vzalloc(n);
|
||||
if (!d->stripe_sectors_dirty)
|
||||
return -ENOMEM;
|
||||
|
||||
if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
|
||||
!(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
|
||||
@ -759,6 +785,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
|
||||
!(q = blk_alloc_queue(GFP_KERNEL)))
|
||||
return -ENOMEM;
|
||||
|
||||
set_capacity(d->disk, sectors);
|
||||
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
|
||||
|
||||
d->disk->major = bcache_major;
|
||||
@ -781,6 +808,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size)
|
||||
set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
|
||||
set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
|
||||
|
||||
blk_queue_flush(q, REQ_FLUSH|REQ_FUA);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -800,6 +829,17 @@ static void calc_cached_dev_sectors(struct cache_set *c)
|
||||
void bch_cached_dev_run(struct cached_dev *dc)
|
||||
{
|
||||
struct bcache_device *d = &dc->disk;
|
||||
char buf[SB_LABEL_SIZE + 1];
|
||||
char *env[] = {
|
||||
"DRIVER=bcache",
|
||||
kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
|
||||
NULL,
|
||||
NULL,
|
||||
};
|
||||
|
||||
memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
|
||||
buf[SB_LABEL_SIZE] = '\0';
|
||||
env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
|
||||
|
||||
if (atomic_xchg(&dc->running, 1))
|
||||
return;
|
||||
@ -816,10 +856,12 @@ void bch_cached_dev_run(struct cached_dev *dc)
|
||||
|
||||
add_disk(d->disk);
|
||||
bd_link_disk_holder(dc->bdev, dc->disk.disk);
|
||||
#if 0
|
||||
char *env[] = { "SYMLINK=label" , NULL };
|
||||
/* won't show up in the uevent file, use udevadm monitor -e instead
|
||||
* only class / kset properties are persistent */
|
||||
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
|
||||
#endif
|
||||
kfree(env[1]);
|
||||
kfree(env[2]);
|
||||
|
||||
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
|
||||
sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
|
||||
pr_debug("error creating sysfs link");
|
||||
@ -960,6 +1002,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
|
||||
atomic_set(&dc->count, 1);
|
||||
|
||||
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
|
||||
bch_sectors_dirty_init(dc);
|
||||
atomic_set(&dc->has_dirty, 1);
|
||||
atomic_inc(&dc->count);
|
||||
bch_writeback_queue(dc);
|
||||
@ -1014,6 +1057,14 @@ static void cached_dev_flush(struct closure *cl)
|
||||
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
|
||||
struct bcache_device *d = &dc->disk;
|
||||
|
||||
mutex_lock(&bch_register_lock);
|
||||
d->flush_done = 1;
|
||||
|
||||
if (d->c)
|
||||
bcache_device_unlink(d);
|
||||
|
||||
mutex_unlock(&bch_register_lock);
|
||||
|
||||
bch_cache_accounting_destroy(&dc->accounting);
|
||||
kobject_del(&d->kobj);
|
||||
|
||||
@ -1045,7 +1096,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
|
||||
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
|
||||
}
|
||||
|
||||
ret = bcache_device_init(&dc->disk, block_size);
|
||||
ret = bcache_device_init(&dc->disk, block_size,
|
||||
dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -1144,11 +1196,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
|
||||
|
||||
kobject_init(&d->kobj, &bch_flash_dev_ktype);
|
||||
|
||||
if (bcache_device_init(d, block_bytes(c)))
|
||||
if (bcache_device_init(d, block_bytes(c), u->sectors))
|
||||
goto err;
|
||||
|
||||
bcache_device_attach(d, c, u - c->uuids);
|
||||
set_capacity(d->disk, u->sectors);
|
||||
bch_flash_dev_request_init(d);
|
||||
add_disk(d->disk);
|
||||
|
||||
@ -1255,9 +1306,10 @@ static void cache_set_free(struct closure *cl)
|
||||
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
|
||||
free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
|
||||
|
||||
kfree(c->fill_iter);
|
||||
if (c->bio_split)
|
||||
bioset_free(c->bio_split);
|
||||
if (c->fill_iter)
|
||||
mempool_destroy(c->fill_iter);
|
||||
if (c->bio_meta)
|
||||
mempool_destroy(c->bio_meta);
|
||||
if (c->search)
|
||||
@ -1278,11 +1330,9 @@ static void cache_set_free(struct closure *cl)
|
||||
static void cache_set_flush(struct closure *cl)
|
||||
{
|
||||
struct cache_set *c = container_of(cl, struct cache_set, caching);
|
||||
struct cache *ca;
|
||||
struct btree *b;
|
||||
|
||||
/* Shut down allocator threads */
|
||||
set_bit(CACHE_SET_STOPPING_2, &c->flags);
|
||||
wake_up(&c->alloc_wait);
|
||||
unsigned i;
|
||||
|
||||
bch_cache_accounting_destroy(&c->accounting);
|
||||
|
||||
@ -1295,7 +1345,11 @@ static void cache_set_flush(struct closure *cl)
|
||||
/* Should skip this if we're unregistering because of an error */
|
||||
list_for_each_entry(b, &c->btree_cache, list)
|
||||
if (btree_node_dirty(b))
|
||||
bch_btree_write(b, true, NULL);
|
||||
bch_btree_node_write(b, NULL);
|
||||
|
||||
for_each_cache(ca, c, i)
|
||||
if (ca->alloc_thread)
|
||||
kthread_stop(ca->alloc_thread);
|
||||
|
||||
closure_return(cl);
|
||||
}
|
||||
@ -1303,18 +1357,22 @@ static void cache_set_flush(struct closure *cl)
|
||||
static void __cache_set_unregister(struct closure *cl)
|
||||
{
|
||||
struct cache_set *c = container_of(cl, struct cache_set, caching);
|
||||
struct cached_dev *dc, *t;
|
||||
struct cached_dev *dc;
|
||||
size_t i;
|
||||
|
||||
mutex_lock(&bch_register_lock);
|
||||
|
||||
if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
|
||||
list_for_each_entry_safe(dc, t, &c->cached_devs, list)
|
||||
bch_cached_dev_detach(dc);
|
||||
|
||||
for (i = 0; i < c->nr_uuids; i++)
|
||||
if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
|
||||
bcache_device_stop(c->devices[i]);
|
||||
if (c->devices[i]) {
|
||||
if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
|
||||
test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
|
||||
dc = container_of(c->devices[i],
|
||||
struct cached_dev, disk);
|
||||
bch_cached_dev_detach(dc);
|
||||
} else {
|
||||
bcache_device_stop(c->devices[i]);
|
||||
}
|
||||
}
|
||||
|
||||
mutex_unlock(&bch_register_lock);
|
||||
|
||||
@ -1373,9 +1431,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
||||
c->btree_pages = max_t(int, c->btree_pages / 4,
|
||||
BTREE_MAX_PAGES);
|
||||
|
||||
init_waitqueue_head(&c->alloc_wait);
|
||||
c->sort_crit_factor = int_sqrt(c->btree_pages);
|
||||
|
||||
mutex_init(&c->bucket_lock);
|
||||
mutex_init(&c->fill_lock);
|
||||
mutex_init(&c->sort_lock);
|
||||
spin_lock_init(&c->sort_time_lock);
|
||||
closure_init_unlocked(&c->sb_write);
|
||||
@ -1401,8 +1459,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
||||
!(c->bio_meta = mempool_create_kmalloc_pool(2,
|
||||
sizeof(struct bbio) + sizeof(struct bio_vec) *
|
||||
bucket_pages(c))) ||
|
||||
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
|
||||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
|
||||
!(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
|
||||
!(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
|
||||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
|
||||
bch_journal_alloc(c) ||
|
||||
@ -1410,8 +1468,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
||||
bch_open_buckets_alloc(c))
|
||||
goto err;
|
||||
|
||||
c->fill_iter->size = sb->bucket_size / sb->block_size;
|
||||
|
||||
c->congested_read_threshold_us = 2000;
|
||||
c->congested_write_threshold_us = 20000;
|
||||
c->error_limit = 8 << IO_ERROR_SHIFT;
|
||||
@ -1496,9 +1552,10 @@ static void run_cache_set(struct cache_set *c)
|
||||
*/
|
||||
bch_journal_next(&c->journal);
|
||||
|
||||
err = "error starting allocator thread";
|
||||
for_each_cache(ca, c, i)
|
||||
closure_call(&ca->alloc, bch_allocator_thread,
|
||||
system_wq, &c->cl);
|
||||
if (bch_cache_allocator_start(ca))
|
||||
goto err;
|
||||
|
||||
/*
|
||||
* First place it's safe to allocate: btree_check() and
|
||||
@ -1531,17 +1588,16 @@ static void run_cache_set(struct cache_set *c)
|
||||
|
||||
bch_btree_gc_finish(c);
|
||||
|
||||
err = "error starting allocator thread";
|
||||
for_each_cache(ca, c, i)
|
||||
closure_call(&ca->alloc, bch_allocator_thread,
|
||||
ca->alloc_workqueue, &c->cl);
|
||||
if (bch_cache_allocator_start(ca))
|
||||
goto err;
|
||||
|
||||
mutex_lock(&c->bucket_lock);
|
||||
for_each_cache(ca, c, i)
|
||||
bch_prio_write(ca);
|
||||
mutex_unlock(&c->bucket_lock);
|
||||
|
||||
wake_up(&c->alloc_wait);
|
||||
|
||||
err = "cannot allocate new UUID bucket";
|
||||
if (__uuid_write(c))
|
||||
goto err_unlock_gc;
|
||||
@ -1552,7 +1608,7 @@ static void run_cache_set(struct cache_set *c)
|
||||
goto err_unlock_gc;
|
||||
|
||||
bkey_copy_key(&c->root->key, &MAX_KEY);
|
||||
bch_btree_write(c->root, true, &op);
|
||||
bch_btree_node_write(c->root, &op.cl);
|
||||
|
||||
bch_btree_set_root(c->root);
|
||||
rw_unlock(true, c->root);
|
||||
@ -1673,9 +1729,6 @@ void bch_cache_release(struct kobject *kobj)
|
||||
|
||||
bio_split_pool_free(&ca->bio_split_hook);
|
||||
|
||||
if (ca->alloc_workqueue)
|
||||
destroy_workqueue(ca->alloc_workqueue);
|
||||
|
||||
free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
|
||||
kfree(ca->prio_buckets);
|
||||
vfree(ca->buckets);
|
||||
@ -1723,7 +1776,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
|
||||
!(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
|
||||
2, GFP_KERNEL)) ||
|
||||
!(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
|
||||
!(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
|
||||
bio_split_pool_init(&ca->bio_split_hook))
|
||||
return -ENOMEM;
|
||||
|
||||
@ -1786,6 +1838,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
|
||||
kobj_attribute_write(register, register_bcache);
|
||||
kobj_attribute_write(register_quiet, register_bcache);
|
||||
|
||||
static bool bch_is_open_backing(struct block_device *bdev) {
|
||||
struct cache_set *c, *tc;
|
||||
struct cached_dev *dc, *t;
|
||||
|
||||
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
|
||||
list_for_each_entry_safe(dc, t, &c->cached_devs, list)
|
||||
if (dc->bdev == bdev)
|
||||
return true;
|
||||
list_for_each_entry_safe(dc, t, &uncached_devices, list)
|
||||
if (dc->bdev == bdev)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool bch_is_open_cache(struct block_device *bdev) {
|
||||
struct cache_set *c, *tc;
|
||||
struct cache *ca;
|
||||
unsigned i;
|
||||
|
||||
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
|
||||
for_each_cache(ca, c, i)
|
||||
if (ca->bdev == bdev)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool bch_is_open(struct block_device *bdev) {
|
||||
return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
|
||||
}
|
||||
|
||||
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
|
||||
const char *buffer, size_t size)
|
||||
{
|
||||
@ -1810,8 +1892,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
|
||||
FMODE_READ|FMODE_WRITE|FMODE_EXCL,
|
||||
sb);
|
||||
if (IS_ERR(bdev)) {
|
||||
if (bdev == ERR_PTR(-EBUSY))
|
||||
err = "device busy";
|
||||
if (bdev == ERR_PTR(-EBUSY)) {
|
||||
bdev = lookup_bdev(strim(path));
|
||||
if (!IS_ERR(bdev) && bch_is_open(bdev))
|
||||
err = "device already registered";
|
||||
else
|
||||
err = "device busy";
|
||||
}
|
||||
goto err;
|
||||
}
|
||||
|
||||
|
@ -9,7 +9,9 @@
|
||||
#include "sysfs.h"
|
||||
#include "btree.h"
|
||||
#include "request.h"
|
||||
#include "writeback.h"
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/sort.h>
|
||||
|
||||
static const char * const cache_replacement_policies[] = {
|
||||
@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse);
|
||||
rw_attribute(writeback_rate_d_smooth);
|
||||
read_attribute(writeback_rate_debug);
|
||||
|
||||
read_attribute(stripe_size);
|
||||
read_attribute(partial_stripes_expensive);
|
||||
|
||||
rw_attribute(synchronous);
|
||||
rw_attribute(journal_delay_ms);
|
||||
rw_attribute(discard);
|
||||
@ -127,7 +132,7 @@ SHOW(__bch_cached_dev)
|
||||
char derivative[20];
|
||||
char target[20];
|
||||
bch_hprint(dirty,
|
||||
atomic_long_read(&dc->disk.sectors_dirty) << 9);
|
||||
bcache_dev_sectors_dirty(&dc->disk) << 9);
|
||||
bch_hprint(derivative, dc->writeback_rate_derivative << 9);
|
||||
bch_hprint(target, dc->writeback_rate_target << 9);
|
||||
|
||||
@ -143,7 +148,10 @@ SHOW(__bch_cached_dev)
|
||||
}
|
||||
|
||||
sysfs_hprint(dirty_data,
|
||||
atomic_long_read(&dc->disk.sectors_dirty) << 9);
|
||||
bcache_dev_sectors_dirty(&dc->disk) << 9);
|
||||
|
||||
sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9);
|
||||
var_printf(partial_stripes_expensive, "%u");
|
||||
|
||||
var_printf(sequential_merge, "%i");
|
||||
var_hprint(sequential_cutoff);
|
||||
@ -170,6 +178,7 @@ STORE(__cached_dev)
|
||||
disk.kobj);
|
||||
unsigned v = size;
|
||||
struct cache_set *c;
|
||||
struct kobj_uevent_env *env;
|
||||
|
||||
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
|
||||
#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
|
||||
@ -214,6 +223,7 @@ STORE(__cached_dev)
|
||||
}
|
||||
|
||||
if (attr == &sysfs_label) {
|
||||
/* note: endlines are preserved */
|
||||
memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
|
||||
bch_write_bdev_super(dc, NULL);
|
||||
if (dc->disk.c) {
|
||||
@ -221,6 +231,15 @@ STORE(__cached_dev)
|
||||
buf, SB_LABEL_SIZE);
|
||||
bch_uuid_write(dc->disk.c);
|
||||
}
|
||||
env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
|
||||
if (!env)
|
||||
return -ENOMEM;
|
||||
add_uevent_var(env, "DRIVER=bcache");
|
||||
add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
|
||||
add_uevent_var(env, "CACHED_LABEL=%s", buf);
|
||||
kobject_uevent_env(
|
||||
&disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
|
||||
kfree(env);
|
||||
}
|
||||
|
||||
if (attr == &sysfs_attach) {
|
||||
@ -284,6 +303,8 @@ static struct attribute *bch_cached_dev_files[] = {
|
||||
&sysfs_writeback_rate_d_smooth,
|
||||
&sysfs_writeback_rate_debug,
|
||||
&sysfs_dirty_data,
|
||||
&sysfs_stripe_size,
|
||||
&sysfs_partial_stripes_expensive,
|
||||
&sysfs_sequential_cutoff,
|
||||
&sysfs_sequential_merge,
|
||||
&sysfs_clear_stats,
|
||||
@ -665,12 +686,10 @@ SHOW(__bch_cache)
|
||||
int cmp(const void *l, const void *r)
|
||||
{ return *((uint16_t *) r) - *((uint16_t *) l); }
|
||||
|
||||
/* Number of quantiles we compute */
|
||||
const unsigned nq = 31;
|
||||
|
||||
size_t n = ca->sb.nbuckets, i, unused, btree;
|
||||
uint64_t sum = 0;
|
||||
uint16_t q[nq], *p, *cached;
|
||||
/* Compute 31 quantiles */
|
||||
uint16_t q[31], *p, *cached;
|
||||
ssize_t ret;
|
||||
|
||||
cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
|
||||
@ -703,26 +722,29 @@ SHOW(__bch_cache)
|
||||
if (n)
|
||||
do_div(sum, n);
|
||||
|
||||
for (i = 0; i < nq; i++)
|
||||
q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)];
|
||||
for (i = 0; i < ARRAY_SIZE(q); i++)
|
||||
q[i] = INITIAL_PRIO - cached[n * (i + 1) /
|
||||
(ARRAY_SIZE(q) + 1)];
|
||||
|
||||
vfree(p);
|
||||
|
||||
ret = snprintf(buf, PAGE_SIZE,
|
||||
"Unused: %zu%%\n"
|
||||
"Metadata: %zu%%\n"
|
||||
"Average: %llu\n"
|
||||
"Sectors per Q: %zu\n"
|
||||
"Quantiles: [",
|
||||
unused * 100 / (size_t) ca->sb.nbuckets,
|
||||
btree * 100 / (size_t) ca->sb.nbuckets, sum,
|
||||
n * ca->sb.bucket_size / (nq + 1));
|
||||
ret = scnprintf(buf, PAGE_SIZE,
|
||||
"Unused: %zu%%\n"
|
||||
"Metadata: %zu%%\n"
|
||||
"Average: %llu\n"
|
||||
"Sectors per Q: %zu\n"
|
||||
"Quantiles: [",
|
||||
unused * 100 / (size_t) ca->sb.nbuckets,
|
||||
btree * 100 / (size_t) ca->sb.nbuckets, sum,
|
||||
n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
|
||||
|
||||
for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
|
||||
ret += snprintf(buf + ret, PAGE_SIZE - ret,
|
||||
i < nq - 1 ? "%u " : "%u]\n", q[i]);
|
||||
for (i = 0; i < ARRAY_SIZE(q); i++)
|
||||
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
|
||||
"%u ", q[i]);
|
||||
ret--;
|
||||
|
||||
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
|
||||
|
||||
buf[PAGE_SIZE - 1] = '\0';
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include "btree.h"
|
||||
#include "request.h"
|
||||
|
||||
#include <linux/blktrace_api.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
@ -9,18 +10,44 @@
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
|
||||
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
|
||||
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
|
||||
|
@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
|
||||
}
|
||||
}
|
||||
|
||||
int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
|
||||
{
|
||||
int i;
|
||||
struct bio_vec *bv;
|
||||
|
||||
bio_for_each_segment(bv, bio, i) {
|
||||
bv->bv_page = alloc_page(gfp);
|
||||
if (!bv->bv_page) {
|
||||
while (bv-- != bio->bi_io_vec + bio->bi_idx)
|
||||
__free_page(bv->bv_page);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
|
||||
* use permitted, subject to terms of PostgreSQL license; see.)
|
||||
|
@ -15,8 +15,6 @@
|
||||
|
||||
struct closure;
|
||||
|
||||
#include <trace/events/bcache.h>
|
||||
|
||||
#ifdef CONFIG_BCACHE_EDEBUG
|
||||
|
||||
#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
|
||||
@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
|
||||
return x;
|
||||
}
|
||||
|
||||
#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
|
||||
|
||||
void bch_bio_map(struct bio *bio, void *base);
|
||||
|
||||
int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
|
||||
|
||||
static inline sector_t bdev_sectors(struct block_device *bdev)
|
||||
{
|
||||
return bdev->bd_inode->i_size >> 9;
|
||||
|
@ -9,6 +9,9 @@
|
||||
#include "bcache.h"
|
||||
#include "btree.h"
|
||||
#include "debug.h"
|
||||
#include "writeback.h"
|
||||
|
||||
#include <trace/events/bcache.h>
|
||||
|
||||
static struct workqueue_struct *dirty_wq;
|
||||
|
||||
@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
|
||||
|
||||
int change = 0;
|
||||
int64_t error;
|
||||
int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
|
||||
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
|
||||
int64_t derivative = dirty - dc->disk.sectors_dirty_last;
|
||||
|
||||
dc->disk.sectors_dirty_last = dirty;
|
||||
@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
|
||||
return KEY_DIRTY(k);
|
||||
}
|
||||
|
||||
static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
|
||||
{
|
||||
uint64_t stripe;
|
||||
unsigned nr_sectors = KEY_SIZE(k);
|
||||
struct cached_dev *dc = container_of(buf, struct cached_dev,
|
||||
writeback_keys);
|
||||
unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
|
||||
|
||||
if (!KEY_DIRTY(k))
|
||||
return false;
|
||||
|
||||
stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
|
||||
while (1) {
|
||||
if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
|
||||
stripe_size)
|
||||
return false;
|
||||
|
||||
if (nr_sectors <= stripe_size)
|
||||
return true;
|
||||
|
||||
nr_sectors -= stripe_size;
|
||||
stripe++;
|
||||
}
|
||||
}
|
||||
|
||||
static void dirty_init(struct keybuf_key *w)
|
||||
{
|
||||
struct dirty_io *io = w->private;
|
||||
@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl)
|
||||
searched_from_start = true;
|
||||
}
|
||||
|
||||
bch_refill_keybuf(dc->disk.c, buf, &end);
|
||||
if (dc->partial_stripes_expensive) {
|
||||
uint64_t i;
|
||||
|
||||
for (i = 0; i < dc->disk.nr_stripes; i++)
|
||||
if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
|
||||
1 << dc->disk.stripe_size_bits)
|
||||
goto full_stripes;
|
||||
|
||||
goto normal_refill;
|
||||
full_stripes:
|
||||
bch_refill_keybuf(dc->disk.c, buf, &end,
|
||||
dirty_full_stripe_pred);
|
||||
} else {
|
||||
normal_refill:
|
||||
bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
|
||||
}
|
||||
|
||||
if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
|
||||
/* Searched the entire btree - delay awhile */
|
||||
@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc)
|
||||
}
|
||||
}
|
||||
|
||||
void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
|
||||
void bch_writeback_add(struct cached_dev *dc)
|
||||
{
|
||||
atomic_long_add(sectors, &dc->disk.sectors_dirty);
|
||||
|
||||
if (!atomic_read(&dc->has_dirty) &&
|
||||
!atomic_xchg(&dc->has_dirty, 1)) {
|
||||
atomic_inc(&dc->count);
|
||||
@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
|
||||
}
|
||||
}
|
||||
|
||||
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
|
||||
uint64_t offset, int nr_sectors)
|
||||
{
|
||||
struct bcache_device *d = c->devices[inode];
|
||||
unsigned stripe_size, stripe_offset;
|
||||
uint64_t stripe;
|
||||
|
||||
if (!d)
|
||||
return;
|
||||
|
||||
stripe_size = 1 << d->stripe_size_bits;
|
||||
stripe = offset >> d->stripe_size_bits;
|
||||
stripe_offset = offset & (stripe_size - 1);
|
||||
|
||||
while (nr_sectors) {
|
||||
int s = min_t(unsigned, abs(nr_sectors),
|
||||
stripe_size - stripe_offset);
|
||||
|
||||
if (nr_sectors < 0)
|
||||
s = -s;
|
||||
|
||||
atomic_add(s, d->stripe_sectors_dirty + stripe);
|
||||
nr_sectors -= s;
|
||||
stripe_offset = 0;
|
||||
stripe++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Background writeback - IO loop */
|
||||
|
||||
static void dirty_io_destructor(struct closure *cl)
|
||||
@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl)
|
||||
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
|
||||
struct keybuf_key *w = io->bio.bi_private;
|
||||
struct cached_dev *dc = io->dc;
|
||||
struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
|
||||
struct bio_vec *bv;
|
||||
int i;
|
||||
|
||||
while (bv-- != io->bio.bi_io_vec)
|
||||
bio_for_each_segment_all(bv, &io->bio, i)
|
||||
__free_page(bv->bv_page);
|
||||
|
||||
/* This is kind of a dumb way of signalling errors. */
|
||||
@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl)
|
||||
for (i = 0; i < KEY_PTRS(&w->key); i++)
|
||||
atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
|
||||
|
||||
pr_debug("clearing %s", pkey(&w->key));
|
||||
bch_btree_insert(&op, dc->disk.c);
|
||||
closure_sync(&op.cl);
|
||||
|
||||
if (op.insert_collision)
|
||||
trace_bcache_writeback_collision(&w->key);
|
||||
|
||||
atomic_long_inc(op.insert_collision
|
||||
? &dc->disk.c->writeback_keys_failed
|
||||
: &dc->disk.c->writeback_keys_done);
|
||||
@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl)
|
||||
io->bio.bi_bdev = io->dc->bdev;
|
||||
io->bio.bi_end_io = dirty_endio;
|
||||
|
||||
trace_bcache_write_dirty(&io->bio);
|
||||
closure_bio_submit(&io->bio, cl, &io->dc->disk);
|
||||
|
||||
continue_at(cl, write_dirty_finish, dirty_wq);
|
||||
@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl)
|
||||
{
|
||||
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
|
||||
|
||||
trace_bcache_read_dirty(&io->bio);
|
||||
closure_bio_submit(&io->bio, cl, &io->dc->disk);
|
||||
|
||||
continue_at(cl, write_dirty, dirty_wq);
|
||||
@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl)
|
||||
io->bio.bi_rw = READ;
|
||||
io->bio.bi_end_io = read_dirty_endio;
|
||||
|
||||
if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
|
||||
if (bio_alloc_pages(&io->bio, GFP_KERNEL))
|
||||
goto err_free;
|
||||
|
||||
pr_debug("%s", pkey(&w->key));
|
||||
trace_bcache_writeback(&w->key);
|
||||
|
||||
closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
|
||||
|
||||
@ -375,12 +445,49 @@ err:
|
||||
refill_dirty(cl);
|
||||
}
|
||||
|
||||
/* Init */
|
||||
|
||||
static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
|
||||
struct cached_dev *dc)
|
||||
{
|
||||
struct bkey *k;
|
||||
struct btree_iter iter;
|
||||
|
||||
bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
|
||||
while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
|
||||
if (!b->level) {
|
||||
if (KEY_INODE(k) > dc->disk.id)
|
||||
break;
|
||||
|
||||
if (KEY_DIRTY(k))
|
||||
bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
|
||||
KEY_START(k),
|
||||
KEY_SIZE(k));
|
||||
} else {
|
||||
btree(sectors_dirty_init, k, b, op, dc);
|
||||
if (KEY_INODE(k) > dc->disk.id)
|
||||
break;
|
||||
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch_sectors_dirty_init(struct cached_dev *dc)
|
||||
{
|
||||
struct btree_op op;
|
||||
|
||||
bch_btree_op_init_stack(&op);
|
||||
btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
|
||||
}
|
||||
|
||||
void bch_cached_dev_writeback_init(struct cached_dev *dc)
|
||||
{
|
||||
closure_init_unlocked(&dc->writeback);
|
||||
init_rwsem(&dc->writeback_lock);
|
||||
|
||||
bch_keybuf_init(&dc->writeback_keys, dirty_pred);
|
||||
bch_keybuf_init(&dc->writeback_keys);
|
||||
|
||||
dc->writeback_metadata = true;
|
||||
dc->writeback_running = true;
|
||||
|
64
drivers/md/bcache/writeback.h
Normal file
64
drivers/md/bcache/writeback.h
Normal file
@ -0,0 +1,64 @@
|
||||
#ifndef _BCACHE_WRITEBACK_H
|
||||
#define _BCACHE_WRITEBACK_H
|
||||
|
||||
#define CUTOFF_WRITEBACK 40
|
||||
#define CUTOFF_WRITEBACK_SYNC 70
|
||||
|
||||
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
|
||||
{
|
||||
uint64_t i, ret = 0;
|
||||
|
||||
for (i = 0; i < d->nr_stripes; i++)
|
||||
ret += atomic_read(d->stripe_sectors_dirty + i);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
|
||||
uint64_t offset,
|
||||
unsigned nr_sectors)
|
||||
{
|
||||
uint64_t stripe = offset >> d->stripe_size_bits;
|
||||
|
||||
while (1) {
|
||||
if (atomic_read(d->stripe_sectors_dirty + stripe))
|
||||
return true;
|
||||
|
||||
if (nr_sectors <= 1 << d->stripe_size_bits)
|
||||
return false;
|
||||
|
||||
nr_sectors -= 1 << d->stripe_size_bits;
|
||||
stripe++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * should_writeback() - decide whether @bio should be written in writeback mode
 * @dc:         cached device the bio is addressed to
 * @bio:        the write being considered
 * @cache_mode: cache mode to evaluate the decision for
 * @would_skip: true if the caller would otherwise bypass the cache
 *
 * The checks are applied in this order:
 *  - never, unless @cache_mode is CACHE_MODE_WRITEBACK, the device is not
 *    detaching, and gc_stats.in_use does not exceed CUTOFF_WRITEBACK_SYNC;
 *  - always, if partial stripes are expensive for this device and the bio
 *    touches a stripe that already holds dirty data;
 *  - never, if @would_skip is set;
 *  - otherwise for REQ_SYNC bios, or whenever gc_stats.in_use is at most
 *    CUTOFF_WRITEBACK.
 */
static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
				    unsigned cache_mode, bool would_skip)
{
	unsigned in_use = dc->disk.c->gc_stats.in_use;

	if (cache_mode != CACHE_MODE_WRITEBACK)
		return false;

	if (atomic_read(&dc->disk.detaching))
		return false;

	if (in_use > CUTOFF_WRITEBACK_SYNC)
		return false;

	if (dc->partial_stripes_expensive &&
	    bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
				    bio_sectors(bio)))
		return true;

	if (would_skip)
		return false;

	if (bio->bi_rw & REQ_SYNC)
		return true;

	return in_use <= CUTOFF_WRITEBACK;
}
|
||||
|
||||
void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
|
||||
void bch_writeback_queue(struct cached_dev *);
|
||||
void bch_writeback_add(struct cached_dev *);
|
||||
|
||||
void bch_sectors_dirty_init(struct cached_dev *dc);
|
||||
void bch_cached_dev_writeback_init(struct cached_dev *);
|
||||
|
||||
#endif
|
@ -177,7 +177,11 @@ enum drbd_ret_code {
|
||||
ERR_NEED_APV_100 = 163,
|
||||
ERR_NEED_ALLOW_TWO_PRI = 164,
|
||||
ERR_MD_UNCLEAN = 165,
|
||||
|
||||
ERR_MD_LAYOUT_CONNECTED = 166,
|
||||
ERR_MD_LAYOUT_TOO_BIG = 167,
|
||||
ERR_MD_LAYOUT_TOO_SMALL = 168,
|
||||
ERR_MD_LAYOUT_NO_FIT = 169,
|
||||
ERR_IMPLICIT_SHRINK = 170,
|
||||
/* insert new ones above this line */
|
||||
AFTER_LAST_ERR_CODE
|
||||
};
|
||||
|
@ -181,6 +181,8 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms,
|
||||
__u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size)
|
||||
__flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force)
|
||||
__flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync)
|
||||
__u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF)
|
||||
__u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF)
|
||||
)
|
||||
|
||||
GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info,
|
||||
|
@ -215,4 +215,13 @@
|
||||
#define DRBD_ALWAYS_ASBP_DEF 0
|
||||
#define DRBD_USE_RLE_DEF 1
|
||||
|
||||
#define DRBD_AL_STRIPES_MIN 1
|
||||
#define DRBD_AL_STRIPES_MAX 1024
|
||||
#define DRBD_AL_STRIPES_DEF 1
|
||||
#define DRBD_AL_STRIPES_SCALE '1'
|
||||
|
||||
#define DRBD_AL_STRIPE_SIZE_MIN 4
|
||||
#define DRBD_AL_STRIPE_SIZE_MAX 16777216
|
||||
#define DRBD_AL_STRIPE_SIZE_DEF 32
|
||||
#define DRBD_AL_STRIPE_SIZE_SCALE 'k' /* kilobytes */
|
||||
#endif
|
||||
|
@ -9,9 +9,7 @@
|
||||
struct search;
|
||||
|
||||
DECLARE_EVENT_CLASS(bcache_request,
|
||||
|
||||
TP_PROTO(struct search *s, struct bio *bio),
|
||||
|
||||
TP_ARGS(s, bio),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
@ -22,7 +20,6 @@ DECLARE_EVENT_CLASS(bcache_request,
|
||||
__field(dev_t, orig_sector )
|
||||
__field(unsigned int, nr_sector )
|
||||
__array(char, rwbs, 6 )
|
||||
__array(char, comm, TASK_COMM_LEN )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@ -33,36 +30,66 @@ DECLARE_EVENT_CLASS(bcache_request,
|
||||
__entry->orig_sector = bio->bi_sector - 16;
|
||||
__entry->nr_sector = bio->bi_size >> 9;
|
||||
blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
|
||||
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
|
||||
),
|
||||
|
||||
TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)",
|
||||
TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rwbs,
|
||||
(unsigned long long)__entry->sector,
|
||||
__entry->nr_sector, __entry->comm,
|
||||
__entry->orig_major, __entry->orig_minor,
|
||||
__entry->rwbs, (unsigned long long)__entry->sector,
|
||||
__entry->nr_sector, __entry->orig_major, __entry->orig_minor,
|
||||
(unsigned long long)__entry->orig_sector)
|
||||
);
|
||||
|
||||
/*
 * Event class for tracepoints that report a single btree key.
 *
 * @k: key being traced.
 *
 * The entry stores the key's inode, offset, size and dirty flag, read via
 * the KEY_INODE(), KEY_OFFSET(), KEY_SIZE() and KEY_DIRTY() accessors, and
 * prints them as "<inode>:<offset> len <size> dirty <dirty>".
 */
DECLARE_EVENT_CLASS(bkey,
|
||||
TP_PROTO(struct bkey *k),
|
||||
TP_ARGS(k),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(u32, size )
|
||||
__field(u32, inode )
|
||||
__field(u64, offset )
|
||||
__field(bool, dirty )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->inode = KEY_INODE(k);
|
||||
__entry->offset = KEY_OFFSET(k);
|
||||
__entry->size = KEY_SIZE(k);
|
||||
__entry->dirty = KEY_DIRTY(k);
|
||||
),
|
||||
|
||||
TP_printk("%u:%llu len %u dirty %u", __entry->inode,
|
||||
__entry->offset, __entry->size, __entry->dirty)
|
||||
);
|
||||
|
||||
/*
 * Event class for tracepoints that identify a btree node by bucket.
 *
 * @b: btree node being traced.
 *
 * The only recorded field is the bucket number obtained from
 * PTR_BUCKET_NR(b->c, &b->key, 0); it is printed as "bucket <n>".
 * NOTE(review): the trailing 0 is presumably the index of the first
 * pointer in the node's key -- confirm against PTR_BUCKET_NR().
 */
DECLARE_EVENT_CLASS(btree_node,
|
||||
TP_PROTO(struct btree *b),
|
||||
TP_ARGS(b),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(size_t, bucket )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
|
||||
),
|
||||
|
||||
TP_printk("bucket %zu", __entry->bucket)
|
||||
);
|
||||
|
||||
/* request.c */
|
||||
|
||||
DEFINE_EVENT(bcache_request, bcache_request_start,
|
||||
|
||||
TP_PROTO(struct search *s, struct bio *bio),
|
||||
|
||||
TP_ARGS(s, bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_request, bcache_request_end,
|
||||
|
||||
TP_PROTO(struct search *s, struct bio *bio),
|
||||
|
||||
TP_ARGS(s, bio)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(bcache_bio,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
@ -70,7 +97,6 @@ DECLARE_EVENT_CLASS(bcache_bio,
|
||||
__field(sector_t, sector )
|
||||
__field(unsigned int, nr_sector )
|
||||
__array(char, rwbs, 6 )
|
||||
__array(char, comm, TASK_COMM_LEN )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@ -78,191 +104,328 @@ DECLARE_EVENT_CLASS(bcache_bio,
|
||||
__entry->sector = bio->bi_sector;
|
||||
__entry->nr_sector = bio->bi_size >> 9;
|
||||
blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
|
||||
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
|
||||
),
|
||||
|
||||
TP_printk("%d,%d %s %llu + %u [%s]",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rwbs,
|
||||
(unsigned long long)__entry->sector,
|
||||
__entry->nr_sector, __entry->comm)
|
||||
TP_printk("%d,%d %s %llu + %u",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
|
||||
(unsigned long long)__entry->sector, __entry->nr_sector)
|
||||
);
|
||||
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_passthrough,
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_bypass_sequential,
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_cache_hit,
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_bypass_congested,
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_cache_miss,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_read_retry,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_writethrough,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_writeback,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_write_skip,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_btree_read,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_btree_write,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_write_dirty,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_read_dirty,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_write_moving,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_read_moving,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_journal_write,
|
||||
|
||||
TP_PROTO(struct bio *bio),
|
||||
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(bcache_cache_bio,
|
||||
|
||||
TP_PROTO(struct bio *bio,
|
||||
sector_t orig_sector,
|
||||
struct block_device* orig_bdev),
|
||||
|
||||
TP_ARGS(bio, orig_sector, orig_bdev),
|
||||
TRACE_EVENT(bcache_read,
|
||||
TP_PROTO(struct bio *bio, bool hit, bool bypass),
|
||||
TP_ARGS(bio, hit, bypass),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev )
|
||||
__field(dev_t, orig_dev )
|
||||
__field(sector_t, sector )
|
||||
__field(sector_t, orig_sector )
|
||||
__field(unsigned int, nr_sector )
|
||||
__array(char, rwbs, 6 )
|
||||
__array(char, comm, TASK_COMM_LEN )
|
||||
__field(bool, cache_hit )
|
||||
__field(bool, bypass )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->dev = bio->bi_bdev->bd_dev;
|
||||
__entry->orig_dev = orig_bdev->bd_dev;
|
||||
__entry->sector = bio->bi_sector;
|
||||
__entry->orig_sector = orig_sector;
|
||||
__entry->nr_sector = bio->bi_size >> 9;
|
||||
blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
|
||||
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
|
||||
__entry->cache_hit = hit;
|
||||
__entry->bypass = bypass;
|
||||
),
|
||||
|
||||
TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)",
|
||||
TP_printk("%d,%d %s %llu + %u hit %u bypass %u",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rwbs,
|
||||
(unsigned long long)__entry->sector,
|
||||
__entry->nr_sector, __entry->comm,
|
||||
MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev),
|
||||
(unsigned long long)__entry->orig_sector)
|
||||
__entry->rwbs, (unsigned long long)__entry->sector,
|
||||
__entry->nr_sector, __entry->cache_hit, __entry->bypass)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert,
|
||||
|
||||
TP_PROTO(struct bio *bio,
|
||||
sector_t orig_sector,
|
||||
struct block_device *orig_bdev),
|
||||
|
||||
TP_ARGS(bio, orig_sector, orig_bdev)
|
||||
);
|
||||
|
||||
DECLARE_EVENT_CLASS(bcache_gc,
|
||||
|
||||
TP_PROTO(uint8_t *uuid),
|
||||
|
||||
TP_ARGS(uuid),
|
||||
TRACE_EVENT(bcache_write,
|
||||
TP_PROTO(struct bio *bio, bool writeback, bool bypass),
|
||||
TP_ARGS(bio, writeback, bypass),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(uint8_t *, uuid)
|
||||
__field(dev_t, dev )
|
||||
__field(sector_t, sector )
|
||||
__field(unsigned int, nr_sector )
|
||||
__array(char, rwbs, 6 )
|
||||
__field(bool, writeback )
|
||||
__field(bool, bypass )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->uuid = uuid;
|
||||
__entry->dev = bio->bi_bdev->bd_dev;
|
||||
__entry->sector = bio->bi_sector;
|
||||
__entry->nr_sector = bio->bi_size >> 9;
|
||||
blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
|
||||
__entry->writeback = writeback;
|
||||
__entry->bypass = bypass;
|
||||
),
|
||||
|
||||
TP_printk("%d,%d %s %llu + %u hit %u bypass %u",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__entry->rwbs, (unsigned long long)__entry->sector,
|
||||
__entry->nr_sector, __entry->writeback, __entry->bypass)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_bio, bcache_read_retry,
|
||||
TP_PROTO(struct bio *bio),
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bkey, bcache_cache_insert,
|
||||
TP_PROTO(struct bkey *k),
|
||||
TP_ARGS(k)
|
||||
);
|
||||
|
||||
/* Journal */
|
||||
|
||||
/*
 * Event class for tracepoints that only need to identify a cache set.
 *
 * @c: cache set being traced.
 *
 * The 16 bytes of c->sb.set_uuid are copied into the entry (rather than
 * keeping a pointer to them) and printed with the %pU format specifier.
 */
DECLARE_EVENT_CLASS(cache_set,
|
||||
TP_PROTO(struct cache_set *c),
|
||||
TP_ARGS(c),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array(char, uuid, 16 )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
memcpy(__entry->uuid, c->sb.set_uuid, 16);
|
||||
),
|
||||
|
||||
TP_printk("%pU", __entry->uuid)
|
||||
);
|
||||
|
||||
|
||||
DEFINE_EVENT(bcache_gc, bcache_gc_start,
|
||||
|
||||
TP_PROTO(uint8_t *uuid),
|
||||
|
||||
TP_ARGS(uuid)
|
||||
DEFINE_EVENT(bkey, bcache_journal_replay_key,
|
||||
TP_PROTO(struct bkey *k),
|
||||
TP_ARGS(k)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bcache_gc, bcache_gc_end,
|
||||
DEFINE_EVENT(cache_set, bcache_journal_full,
|
||||
TP_PROTO(struct cache_set *c),
|
||||
TP_ARGS(c)
|
||||
);
|
||||
|
||||
TP_PROTO(uint8_t *uuid),
|
||||
DEFINE_EVENT(cache_set, bcache_journal_entry_full,
|
||||
TP_PROTO(struct cache_set *c),
|
||||
TP_ARGS(c)
|
||||
);
|
||||
|
||||
TP_ARGS(uuid)
|
||||
DEFINE_EVENT(bcache_bio, bcache_journal_write,
|
||||
TP_PROTO(struct bio *bio),
|
||||
TP_ARGS(bio)
|
||||
);
|
||||
|
||||
/* Btree */
|
||||
|
||||
DEFINE_EVENT(cache_set, bcache_btree_cache_cannibalize,
|
||||
TP_PROTO(struct cache_set *c),
|
||||
TP_ARGS(c)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(btree_node, bcache_btree_read,
|
||||
TP_PROTO(struct btree *b),
|
||||
TP_ARGS(b)
|
||||
);
|
||||
|
||||
/*
 * Tracepoint for a btree node write.
 *
 * @b: btree node being written.
 *
 * Records the node's bucket number (PTR_BUCKET_NR(b->c, &b->key, 0)),
 * b->written as "block", and the key count of the set at index b->nsets
 * as "keys". All three recorded fields are printed; previously "block"
 * and "keys" were captured but never appeared in the trace text.
 */
TRACE_EVENT(bcache_btree_write,
|
||||
TP_PROTO(struct btree *b),
|
||||
TP_ARGS(b),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(size_t, bucket )
|
||||
__field(unsigned, block )
|
||||
__field(unsigned, keys )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
|
||||
__entry->block = b->written;
|
||||
__entry->keys = b->sets[b->nsets].data->keys;
|
||||
),
|
||||
|
||||
TP_printk("bucket %zu block %u keys %u",
|
||||
__entry->bucket, __entry->block, __entry->keys)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(btree_node, bcache_btree_node_alloc,
|
||||
TP_PROTO(struct btree *b),
|
||||
TP_ARGS(b)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail,
|
||||
TP_PROTO(struct btree *b),
|
||||
TP_ARGS(b)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(btree_node, bcache_btree_node_free,
|
||||
TP_PROTO(struct btree *b),
|
||||
TP_ARGS(b)
|
||||
);
|
||||
|
||||
/*
 * Tracepoint carrying a single node count.
 *
 * @nodes: value stored verbatim in the entry and printed as
 *         "coalesced <nodes> nodes".
 */
TRACE_EVENT(bcache_btree_gc_coalesce,
|
||||
TP_PROTO(unsigned nodes),
|
||||
TP_ARGS(nodes),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned, nodes )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->nodes = nodes;
|
||||
),
|
||||
|
||||
TP_printk("coalesced %u nodes", __entry->nodes)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(cache_set, bcache_gc_start,
|
||||
TP_PROTO(struct cache_set *c),
|
||||
TP_ARGS(c)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(cache_set, bcache_gc_end,
|
||||
TP_PROTO(struct cache_set *c),
|
||||
TP_ARGS(c)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bkey, bcache_gc_copy,
|
||||
TP_PROTO(struct bkey *k),
|
||||
TP_ARGS(k)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bkey, bcache_gc_copy_collision,
|
||||
TP_PROTO(struct bkey *k),
|
||||
TP_ARGS(k)
|
||||
);
|
||||
|
||||
/*
 * Tracepoint for inserting a key into a btree node.
 *
 * @b:      btree node receiving the key.
 * @k:      key being inserted.
 * @op:     caller-supplied operation code (stored truncated to u8).
 * @status: caller-supplied result code (stored truncated to u8).
 *
 * Output layout:
 *   "<status> for <op> at <btree_node>(<btree_level>):
 *    <inode>:<offset> len <size> dirty <dirty>"
 * where btree_node is the bucket number from PTR_BUCKET_NR() and
 * btree_level is b->level.
 * NOTE(review): the meaning of the op/status values is defined by the
 * callers, which are not visible here.
 */
TRACE_EVENT(bcache_btree_insert_key,
|
||||
TP_PROTO(struct btree *b, struct bkey *k, unsigned op, unsigned status),
|
||||
TP_ARGS(b, k, op, status),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(u64, btree_node )
|
||||
__field(u32, btree_level )
|
||||
__field(u32, inode )
|
||||
__field(u64, offset )
|
||||
__field(u32, size )
|
||||
__field(u8, dirty )
|
||||
__field(u8, op )
|
||||
__field(u8, status )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->btree_node = PTR_BUCKET_NR(b->c, &b->key, 0);
|
||||
__entry->btree_level = b->level;
|
||||
__entry->inode = KEY_INODE(k);
|
||||
__entry->offset = KEY_OFFSET(k);
|
||||
__entry->size = KEY_SIZE(k);
|
||||
__entry->dirty = KEY_DIRTY(k);
|
||||
__entry->op = op;
|
||||
__entry->status = status;
|
||||
),
|
||||
|
||||
TP_printk("%u for %u at %llu(%u): %u:%llu len %u dirty %u",
|
||||
__entry->status, __entry->op,
|
||||
__entry->btree_node, __entry->btree_level,
|
||||
__entry->inode, __entry->offset,
|
||||
__entry->size, __entry->dirty)
|
||||
);
|
||||
|
||||
/*
 * Event class for tracepoints that report a btree node plus a key count.
 *
 * @b:    btree node being traced; identified in the output by the bucket
 *        number from PTR_BUCKET_NR(b->c, &b->key, 0).
 * @keys: caller-supplied count, stored verbatim.
 *
 * Printed as "bucket <bucket> keys <keys>".
 */
DECLARE_EVENT_CLASS(btree_split,
|
||||
TP_PROTO(struct btree *b, unsigned keys),
|
||||
TP_ARGS(b, keys),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(size_t, bucket )
|
||||
__field(unsigned, keys )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0);
|
||||
__entry->keys = keys;
|
||||
),
|
||||
|
||||
TP_printk("bucket %zu keys %u", __entry->bucket, __entry->keys)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(btree_split, bcache_btree_node_split,
|
||||
TP_PROTO(struct btree *b, unsigned keys),
|
||||
TP_ARGS(b, keys)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(btree_split, bcache_btree_node_compact,
|
||||
TP_PROTO(struct btree *b, unsigned keys),
|
||||
TP_ARGS(b, keys)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(btree_node, bcache_btree_set_root,
|
||||
TP_PROTO(struct btree *b),
|
||||
TP_ARGS(b)
|
||||
);
|
||||
|
||||
/* Allocator */
|
||||
|
||||
/*
 * Tracepoint that snapshots the allocator FIFOs of a cache device.
 *
 * @ca: cache device being traced.
 *
 * Records fifo_used() of ca->free, ca->free_inc and ca->unused, plus
 * ca->free_inc.size, and prints
 * "free <free> free_inc <free_inc>/<free_inc_size> unused <unused>".
 * NOTE(review): fifo_used() presumably returns the number of occupied
 * slots -- confirm against its definition.
 */
TRACE_EVENT(bcache_alloc_invalidate,
|
||||
TP_PROTO(struct cache *ca),
|
||||
TP_ARGS(ca),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned, free )
|
||||
__field(unsigned, free_inc )
|
||||
__field(unsigned, free_inc_size )
|
||||
__field(unsigned, unused )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->free = fifo_used(&ca->free);
|
||||
__entry->free_inc = fifo_used(&ca->free_inc);
|
||||
__entry->free_inc_size = ca->free_inc.size;
|
||||
__entry->unused = fifo_used(&ca->unused);
|
||||
),
|
||||
|
||||
TP_printk("free %u free_inc %u/%u unused %u", __entry->free,
|
||||
__entry->free_inc, __entry->free_inc_size, __entry->unused)
|
||||
);
|
||||
|
||||
/*
 * Tracepoint that snapshots allocator state for a cache device.
 *
 * @ca: cache device being traced.
 *
 * Records fifo_used() of ca->free, ca->free_inc and ca->unused, plus the
 * current value of the atomic counter ca->set->prio_blocked, and prints
 * "free <free> free_inc <free_inc> unused <unused> blocked <blocked>".
 */
TRACE_EVENT(bcache_alloc_fail,
|
||||
TP_PROTO(struct cache *ca),
|
||||
TP_ARGS(ca),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned, free )
|
||||
__field(unsigned, free_inc )
|
||||
__field(unsigned, unused )
|
||||
__field(unsigned, blocked )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->free = fifo_used(&ca->free);
|
||||
__entry->free_inc = fifo_used(&ca->free_inc);
|
||||
__entry->unused = fifo_used(&ca->unused);
|
||||
__entry->blocked = atomic_read(&ca->set->prio_blocked);
|
||||
),
|
||||
|
||||
TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free,
|
||||
__entry->free_inc, __entry->unused, __entry->blocked)
|
||||
);
|
||||
|
||||
/* Background writeback */
|
||||
|
||||
DEFINE_EVENT(bkey, bcache_writeback,
|
||||
TP_PROTO(struct bkey *k),
|
||||
TP_ARGS(k)
|
||||
);
|
||||
|
||||
DEFINE_EVENT(bkey, bcache_writeback_collision,
|
||||
TP_PROTO(struct bkey *k),
|
||||
TP_ARGS(k)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_BCACHE_H */
|
||||
|
@ -102,6 +102,30 @@ typedef uint64_t blkif_sector_t;
|
||||
*/
|
||||
#define BLKIF_OP_DISCARD 5
|
||||
|
||||
/*
|
||||
* Recognized if "feature-max-indirect-segments" is present in the backend
|
||||
* xenbus info. The "feature-max-indirect-segments" node contains the maximum
|
||||
* number of segments allowed by the backend per request. If the node is
|
||||
* present, the frontend might use blkif_request_indirect structs in order to
|
||||
* issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
|
||||
* maximum number of indirect segments is fixed by the backend, but the
|
||||
* frontend can issue requests with any number of indirect segments as long as
|
||||
* it's less than the number provided by the backend. The indirect_grefs field
|
||||
* in blkif_request_indirect should be filled by the frontend with the
|
||||
* grant references of the pages that are holding the indirect segments.
|
||||
* These pages are filled with an array of blkif_request_segment_aligned
|
||||
* that hold the information about the segments. The number of indirect
|
||||
* pages to use is determined by the maximum number of segments
|
||||
* an indirect request contains. Every indirect page can contain a maximum
|
||||
* of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
|
||||
* so to calculate the number of indirect pages to use we have to do
|
||||
* ceil(indirect_segments/512).
|
||||
*
|
||||
* If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
|
||||
* create the "feature-max-indirect-segments" node!
|
||||
*/
|
||||
#define BLKIF_OP_INDIRECT 6
|
||||
|
||||
/*
|
||||
* Maximum scatter/gather segments per request.
|
||||
* This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
|
||||
@ -109,6 +133,16 @@ typedef uint64_t blkif_sector_t;
|
||||
*/
|
||||
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
|
||||
|
||||
#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
|
||||
|
||||
/*
 * One segment descriptor as stored in the pages referenced by an indirect
 * request (those pages hold an array of this struct). The struct is
 * packed and explicitly padded; per the _pad comment the intended size is
 * 8 bytes.
 */
struct blkif_request_segment_aligned {
|
||||
grant_ref_t gref; /* reference to I/O buffer frame */
|
||||
/* @first_sect: first sector in frame to transfer (inclusive). */
|
||||
/* @last_sect: last sector in frame to transfer (inclusive). */
|
||||
uint8_t first_sect, last_sect;
|
||||
uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
|
||||
} __attribute__((__packed__));
|
||||
|
||||
struct blkif_request_rw {
|
||||
uint8_t nr_segments; /* number of segments */
|
||||
blkif_vdev_t handle; /* only for read/write requests */
|
||||
@ -147,12 +181,31 @@ struct blkif_request_other {
|
||||
uint64_t id; /* private guest value, echoed in resp */
|
||||
} __attribute__((__packed__));
|
||||
|
||||
/*
 * Request body used with BLKIF_OP_INDIRECT; it is the "indirect" member
 * of the union in struct blkif_request.
 *
 * The struct is packed and padded by hand. The CONFIG_X86_64 branches
 * insert different padding so that, per the field comments, "id" sits at
 * offset 8 of the enclosing request on x86-64 and the struct reaches the
 * intended 64-byte size on both variants.
 * NOTE(review): the size arithmetic depends on the widths of grant_ref_t
 * and blkif_vdev_t, which are declared elsewhere -- confirm.
 */
struct blkif_request_indirect {
|
||||
uint8_t indirect_op; /* NOTE(review): presumably the BLKIF_OP_* to perform -- confirm */
|
||||
uint16_t nr_segments; /* NOTE(review): presumably the indirect segment count -- confirm */
|
||||
#ifdef CONFIG_X86_64
|
||||
uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */
|
||||
#endif
|
||||
uint64_t id;
|
||||
blkif_sector_t sector_number;
|
||||
blkif_vdev_t handle;
|
||||
uint16_t _pad2;
|
||||
/* grant references of the pages holding the indirect segments */
grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
|
||||
#ifdef CONFIG_X86_64
|
||||
uint32_t _pad3; /* make it 64 byte aligned */
|
||||
#else
|
||||
uint64_t _pad3; /* make it 64 byte aligned */
|
||||
#endif
|
||||
} __attribute__((__packed__));
|
||||
|
||||
struct blkif_request {
|
||||
uint8_t operation; /* BLKIF_OP_??? */
|
||||
union {
|
||||
struct blkif_request_rw rw;
|
||||
struct blkif_request_discard discard;
|
||||
struct blkif_request_other other;
|
||||
struct blkif_request_indirect indirect;
|
||||
} u;
|
||||
} __attribute__((__packed__));
|
||||
|
||||
|
@ -188,6 +188,11 @@ struct __name##_back_ring { \
|
||||
#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
|
||||
(((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
|
||||
|
||||
/*
 * Ill-behaved frontend determination: Can there be this many requests?
 *
 * Evaluates to true when the distance from the ring's private response
 * producer index (_r->rsp_prod_pvt) to the request producer index _prod
 * is strictly greater than RING_SIZE(_r).
 */
|
||||
#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \
|
||||
(((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r))
|
||||
|
||||
|
||||
#define RING_PUSH_REQUESTS(_r) do { \
|
||||
wmb(); /* back sees requests /before/ updated producer index */ \
|
||||
(_r)->sring->req_prod = (_r)->req_prod_pvt; \
|
||||
|
Loading…
Reference in New Issue
Block a user