linux/drivers/md/bcache/super.c

2164 lines
52 KiB
C
Raw Normal View History

/*
* bcache setup/teardown code, and some metadata io - read a superblock and
* figure out what to do with it.
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
*/
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
static const char bcache_magic[] = {
0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};
static const char invalid_uuid[] = {
0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};
/* Default is -1; we skip past it for struct cached_dev's cache mode */
const char * const bch_cache_modes[] = {
"default",
"writethrough",
"writeback",
"writearound",
"none",
NULL
};
static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);
static int bcache_major;
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
/* limitation of partitions number on single bcache device */
#define BCACHE_MINORS 128
/* limitation of bcache devices number on single system */
#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
/* Superblock */
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
struct page **res)
{
const char *err;
struct cache_sb *s;
struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
unsigned i;
if (!bh)
return "IO error";
s = (struct cache_sb *) bh->b_data;
sb->offset = le64_to_cpu(s->offset);
sb->version = le64_to_cpu(s->version);
memcpy(sb->magic, s->magic, 16);
memcpy(sb->uuid, s->uuid, 16);
memcpy(sb->set_uuid, s->set_uuid, 16);
memcpy(sb->label, s->label, SB_LABEL_SIZE);
sb->flags = le64_to_cpu(s->flags);
sb->seq = le64_to_cpu(s->seq);
sb->last_mount = le32_to_cpu(s->last_mount);
sb->first_bucket = le16_to_cpu(s->first_bucket);
sb->keys = le16_to_cpu(s->keys);
for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
sb->d[i] = le64_to_cpu(s->d[i]);
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
sb->version, sb->flags, sb->seq, sb->keys);
err = "Not a bcache superblock";
if (sb->offset != SB_SECTOR)
goto err;
if (memcmp(sb->magic, bcache_magic, 16))
goto err;
err = "Too many journal buckets";
if (sb->keys > SB_JOURNAL_BUCKETS)
goto err;
err = "Bad checksum";
if (s->csum != csum_set(s))
goto err;
err = "Bad UUID";
if (bch_is_zero(sb->uuid, 16))
goto err;
sb->block_size = le16_to_cpu(s->block_size);
err = "Superblock block size smaller than device block size";
if (sb->block_size << 9 < bdev_logical_block_size(bdev))
goto err;
switch (sb->version) {
case BCACHE_SB_VERSION_BDEV:
sb->data_offset = BDEV_DATA_START_DEFAULT;
break;
case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
sb->data_offset = le64_to_cpu(s->data_offset);
err = "Bad data offset";
if (sb->data_offset < BDEV_DATA_START_DEFAULT)
goto err;
break;
case BCACHE_SB_VERSION_CDEV:
case BCACHE_SB_VERSION_CDEV_WITH_UUID:
sb->nbuckets = le64_to_cpu(s->nbuckets);
sb->bucket_size = le16_to_cpu(s->bucket_size);
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
err = "Too many buckets";
if (sb->nbuckets > LONG_MAX)
goto err;
err = "Not enough buckets";
if (sb->nbuckets < 1 << 7)
goto err;
err = "Bad block/bucket size";
if (!is_power_of_2(sb->block_size) ||
sb->block_size > PAGE_SECTORS ||
!is_power_of_2(sb->bucket_size) ||
sb->bucket_size < PAGE_SECTORS)
goto err;
err = "Invalid superblock: device too small";
if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
goto err;
err = "Bad UUID";
if (bch_is_zero(sb->set_uuid, 16))
goto err;
err = "Bad cache device number in set";
if (!sb->nr_in_set ||
sb->nr_in_set <= sb->nr_this_dev ||
sb->nr_in_set > MAX_CACHES_PER_SET)
goto err;
err = "Journal buckets not sequential";
for (i = 0; i < sb->keys; i++)
if (sb->d[i] != sb->first_bucket + i)
goto err;
err = "Too many journal buckets";
if (sb->first_bucket + sb->keys > sb->nbuckets)
goto err;
err = "Invalid superblock: first bucket comes before end of super";
if (sb->first_bucket * sb->bucket_size < 16)
goto err;
break;
default:
err = "Unsupported superblock version";
goto err;
}
sb->last_mount = get_seconds();
err = NULL;
get_page(bh->b_page);
*res = bh->b_page;
err:
put_bh(bh);
return err;
}
static void write_bdev_super_endio(struct bio *bio)
{
struct cached_dev *dc = bio->bi_private;
/* XXX: error checking */
closure_put(&dc->sb_write);
}
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
struct cache_sb *out = page_address(bio_first_page_all(bio));
unsigned i;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
bio->bi_iter.bi_sector = SB_SECTOR;
bio->bi_iter.bi_size = SB_SIZE;
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
bch_bio_map(bio, NULL);
out->offset = cpu_to_le64(sb->offset);
out->version = cpu_to_le64(sb->version);
memcpy(out->uuid, sb->uuid, 16);
memcpy(out->set_uuid, sb->set_uuid, 16);
memcpy(out->label, sb->label, SB_LABEL_SIZE);
out->flags = cpu_to_le64(sb->flags);
out->seq = cpu_to_le64(sb->seq);
out->last_mount = cpu_to_le32(sb->last_mount);
out->first_bucket = cpu_to_le16(sb->first_bucket);
out->keys = cpu_to_le16(sb->keys);
for (i = 0; i < sb->keys; i++)
out->d[i] = cpu_to_le64(sb->d[i]);
out->csum = csum_set(out);
pr_debug("ver %llu, flags %llu, seq %llu",
sb->version, sb->flags, sb->seq);
submit_bio(bio);
}
static void bch_write_bdev_super_unlock(struct closure *cl)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
up(&dc->sb_write_mutex);
}
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
struct closure *cl = &dc->sb_write;
struct bio *bio = &dc->sb_bio;
down(&dc->sb_write_mutex);
closure_init(cl, parent);
bio_reset(bio);
bio_set_dev(bio, dc->bdev);
bio->bi_end_io = write_bdev_super_endio;
bio->bi_private = dc;
closure_get(cl);
__write_super(&dc->sb, bio);
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}
static void write_super_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
/* is_read = 0 */
bch_count_io_errors(ca, bio->bi_status, 0,
"writing superblock");
closure_put(&ca->set->sb_write);
}
static void bcache_write_super_unlock(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, sb_write);
up(&c->sb_write_mutex);
}
void bcache_write_super(struct cache_set *c)
{
struct closure *cl = &c->sb_write;
struct cache *ca;
unsigned i;
down(&c->sb_write_mutex);
closure_init(cl, &c->cl);
c->sb.seq++;
for_each_cache(ca, c, i) {
struct bio *bio = &ca->sb_bio;
ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
ca->sb.seq = c->sb.seq;
ca->sb.last_mount = c->sb.last_mount;
SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
bio_reset(bio);
bio_set_dev(bio, ca->bdev);
bio->bi_end_io = write_super_endio;
bio->bi_private = ca;
closure_get(cl);
__write_super(&ca->sb, bio);
}
closure_return_with_destructor(cl, bcache_write_super_unlock);
}
/* UUID io */
static void uuid_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
cache_set_err_on(bio->bi_status, c, "accessing uuids");
bch_bbio_free(bio, c);
closure_put(cl);
}
static void uuid_io_unlock(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
up(&c->uuid_write_mutex);
}
static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
struct bkey *k, struct closure *parent)
{
struct closure *cl = &c->uuid_write;
struct uuid_entry *u;
unsigned i;
char buf[80];
BUG_ON(!parent);
down(&c->uuid_write_mutex);
closure_init(cl, parent);
for (i = 0; i < KEY_PTRS(k); i++) {
struct bio *bio = bch_bbio_alloc(c);
bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
bio->bi_end_io = uuid_endio;
bio->bi_private = cl;
bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
bch_bio_map(bio, c->uuids);
bch_submit_bbio(bio, c, k, i);
if (op != REQ_OP_WRITE)
break;
}
bch_extent_to_text(buf, sizeof(buf), k);
pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
if (!bch_is_zero(u->uuid, 16))
pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
u - c->uuids, u->uuid, u->label,
u->first_reg, u->last_reg, u->invalidated);
closure_return_with_destructor(cl, uuid_io_unlock);
}
static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
struct bkey *k = &j->uuid_bucket;
if (__bch_btree_ptr_invalid(c, k))
return "bad uuid pointer";
bkey_copy(&c->uuid_bucket, k);
uuid_io(c, REQ_OP_READ, 0, k, cl);
if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
struct uuid_entry_v0 *u0 = (void *) c->uuids;
struct uuid_entry *u1 = (void *) c->uuids;
int i;
closure_sync(cl);
/*
* Since the new uuid entry is bigger than the old, we have to
* convert starting at the highest memory address and work down
* in order to do it in place
*/
for (i = c->nr_uuids - 1;
i >= 0;
--i) {
memcpy(u1[i].uuid, u0[i].uuid, 16);
memcpy(u1[i].label, u0[i].label, 32);
u1[i].first_reg = u0[i].first_reg;
u1[i].last_reg = u0[i].last_reg;
u1[i].invalidated = u0[i].invalidated;
u1[i].flags = 0;
u1[i].sectors = 0;
}
}
return NULL;
}
static int __uuid_write(struct cache_set *c)
{
BKEY_PADDED(key) k;
struct closure cl;
closure_init_stack(&cl);
lockdep_assert_held(&bch_register_lock);
if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
return 1;
SET_KEY_SIZE(&k.key, c->sb.bucket_size);
uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
closure_sync(&cl);
bkey_copy(&c->uuid_bucket, &k.key);
bkey_put(c, &k.key);
return 0;
}
int bch_uuid_write(struct cache_set *c)
{
int ret = __uuid_write(c);
if (!ret)
bch_journal_meta(c, NULL);
return ret;
}
static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
struct uuid_entry *u;
for (u = c->uuids;
u < c->uuids + c->nr_uuids; u++)
if (!memcmp(u->uuid, uuid, 16))
return u;
return NULL;
}
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
return uuid_find(c, zero_uuid);
}
/*
* Bucket priorities/gens:
*
* For each bucket, we store on disk its
* 8 bit gen
* 16 bit priority
*
* See alloc.c for an explanation of the gen. The priority is used to implement
* lru (and in the future other) cache replacement policies; for most purposes
* it's just an opaque integer.
*
* The gens and the priorities don't have a whole lot to do with each other, and
* it's actually the gens that must be written out at specific times - it's no
* big deal if the priorities don't get written, if we lose them we just reuse
* buckets in suboptimal order.
*
* On disk they're stored in a packed array, and in as many buckets are required
* to fit them all. The buckets we use to store them form a list; the journal
* header points to the first bucket, the first bucket points to the second
* bucket, et cetera.
*
* This code is used by the allocation code; periodically (whenever it runs out
* of buckets to allocate from) the allocation code will invalidate some
* buckets, but it can't use those buckets until their new gens are safely on
* disk.
*/
static void prio_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
bch_bbio_free(bio, ca->set);
closure_put(&ca->prio);
}
static void prio_io(struct cache *ca, uint64_t bucket, int op,
unsigned long op_flags)
{
struct closure *cl = &ca->prio;
struct bio *bio = bch_bbio_alloc(ca->set);
closure_init_stack(cl);
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
bio_set_dev(bio, ca->bdev);
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
bio->bi_iter.bi_size = bucket_bytes(ca);
bio->bi_end_io = prio_endio;
bio->bi_private = ca;
bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
bch_bio_map(bio, ca->disk_buckets);
closure_bio_submit(bio, &ca->prio);
closure_sync(cl);
}
void bch_prio_write(struct cache *ca)
{
int i;
struct bucket *b;
struct closure cl;
closure_init_stack(&cl);
lockdep_assert_held(&ca->set->bucket_lock);
ca->disk_buckets->seq++;
atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
&ca->meta_sectors_written);
//pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
// fifo_used(&ca->free_inc), fifo_used(&ca->unused));
for (i = prio_buckets(ca) - 1; i >= 0; --i) {
long bucket;
struct prio_set *p = ca->disk_buckets;
struct bucket_disk *d = p->data;
struct bucket_disk *end = d + prios_per_bucket(ca);
for (b = ca->buckets + i * prios_per_bucket(ca);
b < ca->buckets + ca->sb.nbuckets && d < end;
b++, d++) {
d->prio = cpu_to_le16(b->prio);
d->gen = b->gen;
}
p->next_bucket = ca->prio_buckets[i + 1];
p->magic = pset_magic(&ca->sb);
p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
BUG_ON(bucket == -1);
mutex_unlock(&ca->set->bucket_lock);
prio_io(ca, bucket, REQ_OP_WRITE, 0);
mutex_lock(&ca->set->bucket_lock);
ca->prio_buckets[i] = bucket;
atomic_dec_bug(&ca->buckets[bucket].pin);
}
mutex_unlock(&ca->set->bucket_lock);
bch_journal_meta(ca->set, &cl);
closure_sync(&cl);
mutex_lock(&ca->set->bucket_lock);
/*
* Don't want the old priorities to get garbage collected until after we
* finish writing the new ones, and they're journalled
*/
for (i = 0; i < prio_buckets(ca); i++) {
if (ca->prio_last_buckets[i])
__bch_bucket_free(ca,
&ca->buckets[ca->prio_last_buckets[i]]);
ca->prio_last_buckets[i] = ca->prio_buckets[i];
}
}
static void prio_read(struct cache *ca, uint64_t bucket)
{
struct prio_set *p = ca->disk_buckets;
struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
struct bucket *b;
unsigned bucket_nr = 0;
for (b = ca->buckets;
b < ca->buckets + ca->sb.nbuckets;
b++, d++) {
if (d == end) {
ca->prio_buckets[bucket_nr] = bucket;
ca->prio_last_buckets[bucket_nr] = bucket;
bucket_nr++;
prio_io(ca, bucket, REQ_OP_READ, 0);
if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
pr_warn("bad csum reading priorities");
if (p->magic != pset_magic(&ca->sb))
pr_warn("bad magic reading priorities");
bucket = p->next_bucket;
d = p->data;
}
b->prio = le16_to_cpu(d->prio);
b->gen = b->last_gc = d->gen;
}
}
/* Bcache device */
static int open_dev(struct block_device *b, fmode_t mode)
{
struct bcache_device *d = b->bd_disk->private_data;
if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
return -ENXIO;
closure_get(&d->cl);
return 0;
}
static void release_dev(struct gendisk *b, fmode_t mode)
{
struct bcache_device *d = b->private_data;
closure_put(&d->cl);
}
static int ioctl_dev(struct block_device *b, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct bcache_device *d = b->bd_disk->private_data;
return d->ioctl(d, mode, cmd, arg);
}
static const struct block_device_operations bcache_ops = {
.open = open_dev,
.release = release_dev,
.ioctl = ioctl_dev,
.owner = THIS_MODULE,
};
void bcache_device_stop(struct bcache_device *d)
{
if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
closure_queue(&d->cl);
}
static void bcache_device_unlink(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
unsigned i;
struct cache *ca;
sysfs_remove_link(&d->c->kobj, d->name);
sysfs_remove_link(&d->kobj, "cache");
for_each_cache(ca, d->c, i)
bd_unlink_disk_holder(ca->bdev, d->disk);
}
}
static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
const char *name)
{
unsigned i;
struct cache *ca;
for_each_cache(ca, d->c, i)
bd_link_disk_holder(ca->bdev, d->disk);
snprintf(d->name, BCACHEDEVNAME_SIZE,
"%s%u", name, d->id);
WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
sysfs_create_link(&c->kobj, &d->kobj, d->name),
"Couldn't create device <-> cache set symlinks");
bcache: clear BCACHE_DEV_UNLINK_DONE flag when attaching a backing device This bug can be reproduced by the following script: #!/bin/bash bcache_sysfs="/sys/fs/bcache" function clear_cache() { if [ ! -e $bcache_sysfs ]; then echo "no bcache sysfs" exit fi cset_uuid=$(ls -l $bcache_sysfs|head -n 2|tail -n 1|awk '{print $9}') sudo sh -c "echo $cset_uuid > /sys/block/sdb/sdb1/bcache/detach" sleep 5 sudo sh -c "echo $cset_uuid > /sys/block/sdb/sdb1/bcache/attach" } for ((i=0;i<10;i++)); do clear_cache done The warning messages look like below: [ 275.948611] ------------[ cut here ]------------ [ 275.963840] WARNING: at fs/sysfs/dir.c:512 sysfs_add_one+0xb8/0xd0() (Tainted: P W --------------- ) [ 275.979253] Hardware name: Tecal RH2285 [ 275.994106] sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:09.0/0000:08:00.0/host4/target4:2:1/4:2:1:0/block/sdb/sdb1/bcache/cache' [ 276.024105] Modules linked in: bcache tcp_diag inet_diag ipmi_devintf ipmi_si ipmi_msghandler bonding 8021q garp stp llc ipv6 ext3 jbd loop sg iomemory_vsl(P) bnx2 microcode serio_raw i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support i7core_edac edac_core shpchp ext4 jbd2 mbcache megaraid_sas pata_acpi ata_generic ata_piix dm_mod [last unloaded: scsi_wait_scan] [ 276.072643] Pid: 2765, comm: sh Tainted: P W --------------- 2.6.32 #1 [ 276.089315] Call Trace: [ 276.105801] [<ffffffff81070fe7>] ? warn_slowpath_common+0x87/0xc0 [ 276.122650] [<ffffffff810710d6>] ? warn_slowpath_fmt+0x46/0x50 [ 276.139361] [<ffffffff81205c08>] ? sysfs_add_one+0xb8/0xd0 [ 276.156012] [<ffffffff8120609b>] ? sysfs_do_create_link+0x12b/0x170 [ 276.172682] [<ffffffff81206113>] ? sysfs_create_link+0x13/0x20 [ 276.189282] [<ffffffffa03bda21>] ? bcache_device_link+0xc1/0x110 [bcache] [ 276.205993] [<ffffffffa03bfa08>] ? bch_cached_dev_attach+0x478/0x4f0 [bcache] [ 276.222794] [<ffffffffa03c4a17>] ? bch_cached_dev_store+0x627/0x780 [bcache] [ 276.239680] [<ffffffff8116783a>] ? alloc_pages_current+0xaa/0x110 [ 276.256594] [<ffffffff81203b15>] ? sysfs_write_file+0xe5/0x170 [ 276.273364] [<ffffffff811887b8>] ? vfs_write+0xb8/0x1a0 [ 276.290133] [<ffffffff811890b1>] ? sys_write+0x51/0x90 [ 276.306368] [<ffffffff8100c072>] ? system_call_fastpath+0x16/0x1b [ 276.322301] ---[ end trace 9f5d4fcdd0c3edfb ]--- [ 276.338241] ------------[ cut here ]------------ [ 276.354109] WARNING: at /home/wenqing.lz/bcache/bcache/super.c:720 bcache_device_link+0xdf/0x110 [bcache]() (Tainted: P W --------------- ) [ 276.386017] Hardware name: Tecal RH2285 [ 276.401430] Couldn't create device <-> cache set symlinks [ 276.401759] Modules linked in: bcache tcp_diag inet_diag ipmi_devintf ipmi_si ipmi_msghandler bonding 8021q garp stp llc ipv6 ext3 jbd loop sg iomemory_vsl(P) bnx2 microcode serio_raw i2c_i801 i2c_core iTCO_wdt iTCO_vendor_support i7core_edac edac_core shpchp ext4 jbd2 mbcache megaraid_sas pata_acpi ata_generic ata_piix dm_mod [last unloaded: scsi_wait_scan] [ 276.465477] Pid: 2765, comm: sh Tainted: P W --------------- 2.6.32 #1 [ 276.482169] Call Trace: [ 276.498610] [<ffffffff81070fe7>] ? warn_slowpath_common+0x87/0xc0 [ 276.515405] [<ffffffff810710d6>] ? warn_slowpath_fmt+0x46/0x50 [ 276.532059] [<ffffffffa03bda3f>] ? bcache_device_link+0xdf/0x110 [bcache] [ 276.548808] [<ffffffffa03bfa08>] ? bch_cached_dev_attach+0x478/0x4f0 [bcache] [ 276.565569] [<ffffffffa03c4a17>] ? bch_cached_dev_store+0x627/0x780 [bcache] [ 276.582418] [<ffffffff8116783a>] ? alloc_pages_current+0xaa/0x110 [ 276.599341] [<ffffffff81203b15>] ? sysfs_write_file+0xe5/0x170 [ 276.616142] [<ffffffff811887b8>] ? vfs_write+0xb8/0x1a0 [ 276.632607] [<ffffffff811890b1>] ? sys_write+0x51/0x90 [ 276.648671] [<ffffffff8100c072>] ? system_call_fastpath+0x16/0x1b [ 276.664756] ---[ end trace 9f5d4fcdd0c3edfc ]--- We forget to clear BCACHE_DEV_UNLINK_DONE flag in bcache_device_attach() function when we attach a backing device first time. After detaching this backing device, this flag will be true and sysfs_remove_link() isn't called in bcache_device_unlink(). Then when we attach this backing device again, sysfs_create_link() will return EEXIST error in bcache_device_link(). So the fix is trival and we clear this flag in bcache_device_link(). Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Tested-by: Joshua Schmid <jschmid@suse.com> Tested-by: Eric Wheeler <bcache@linux.ewheeler.net> Cc: Kent Overstreet <kmo@daterainc.com> Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe <axboe@fb.com>
2015-11-30 01:19:32 +00:00
clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}
static void bcache_device_detach(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
struct uuid_entry *u = d->c->uuids + d->id;
SET_UUID_FLASH_ONLY(u, 0);
memcpy(u->uuid, invalid_uuid, 16);
u->invalidated = cpu_to_le32(get_seconds());
bch_uuid_write(d->c);
}
bcache_device_unlink(d);
d->c->devices[d->id] = NULL;
closure_put(&d->c->caching);
d->c = NULL;
}
static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
unsigned id)
{
d->id = id;
d->c = c;
c->devices[id] = d;
if (id >= c->devices_max_used)
c->devices_max_used = id + 1;
closure_get(&c->caching);
}
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
static inline int first_minor_to_idx(int first_minor)
{
return (first_minor/BCACHE_MINORS);
}
static inline int idx_to_first_minor(int idx)
{
return (idx * BCACHE_MINORS);
}
static void bcache_device_free(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
pr_info("%s stopped", d->disk->disk_name);
if (d->c)
bcache_device_detach(d);
if (d->disk && d->disk->flags & GENHD_FL_UP)
del_gendisk(d->disk);
if (d->disk && d->disk->queue)
blk_cleanup_queue(d->disk->queue);
if (d->disk) {
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
ida_simple_remove(&bcache_device_idx,
first_minor_to_idx(d->disk->first_minor));
put_disk(d->disk);
}
if (d->bio_split)
bioset_free(d->bio_split);
kvfree(d->full_dirty_stripes);
kvfree(d->stripe_sectors_dirty);
closure_debug_destroy(&d->cl);
}
static int bcache_device_init(struct bcache_device *d, unsigned block_size,
sector_t sectors)
{
struct request_queue *q;
size_t n;
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
if (!d->nr_stripes ||
d->nr_stripes > INT_MAX ||
d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
(unsigned)d->nr_stripes);
return -ENOMEM;
}
n = d->nr_stripes * sizeof(atomic_t);
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
if (!d->stripe_sectors_dirty)
return -ENOMEM;
n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
if (!d->full_dirty_stripes)
return -ENOMEM;
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
idx = ida_simple_get(&bcache_device_idx, 0,
BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
if (idx < 0)
return idx;
if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS |
BIOSET_NEED_RESCUER)) ||
!(d->disk = alloc_disk(BCACHE_MINORS))) {
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
ida_simple_remove(&bcache_device_idx, idx);
return -ENOMEM;
}
set_capacity(d->disk, sectors);
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
d->disk->major = bcache_major;
bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li <colyli@suse.de> Cc: Eric Wheeler <bcache@lists.ewheeler.net> Cc: Junhui Tang <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-13 23:35:31 +00:00
d->disk->first_minor = idx_to_first_minor(idx);
d->disk->fops = &bcache_ops;
d->disk->private_data = d;
q = blk_alloc_queue(GFP_KERNEL);
if (!q)
return -ENOMEM;
blk_queue_make_request(q, NULL);
d->disk->queue = q;
q->queuedata = d;
q->backing_dev_info->congested_data = d;
q->limits.max_hw_sectors = UINT_MAX;
q->limits.max_sectors = UINT_MAX;
q->limits.max_segment_size = UINT_MAX;
q->limits.max_segments = BIO_MAX_PAGES;
blk_queue_max_discard_sectors(q, UINT_MAX);
q->limits.discard_granularity = 512;
q->limits.io_min = block_size;
q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size;
set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
blk_queue_write_cache(q, true, true);
return 0;
}
/* Cached device */
static void calc_cached_dev_sectors(struct cache_set *c)
{
uint64_t sectors = 0;
struct cached_dev *dc;
list_for_each_entry(dc, &c->cached_devs, list)
sectors += bdev_sectors(dc->bdev);
c->cached_dev_sectors = sectors;
}
void bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
char buf[SB_LABEL_SIZE + 1];
char *env[] = {
"DRIVER=bcache",
kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
NULL,
NULL,
};
memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
buf[SB_LABEL_SIZE] = '\0';
env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
if (atomic_xchg(&dc->running, 1)) {
kfree(env[1]);
kfree(env[2]);
return;
}
if (!d->c &&
BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
struct closure cl;
closure_init_stack(&cl);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
}
add_disk(d->disk);
bd_link_disk_holder(dc->bdev, dc->disk.disk);
/* won't show up in the uevent file, use udevadm monitor -e instead
* only class / kset properties are persistent */
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
kfree(env[1]);
kfree(env[2]);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
pr_debug("error creating sysfs link");
}
static void cached_dev_detach_finish(struct work_struct *w)
{
struct cached_dev *dc = container_of(w, struct cached_dev, detach);
char buf[BDEVNAME_SIZE];
struct closure cl;
closure_init_stack(&cl);
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
BUG_ON(refcount_read(&dc->count));
mutex_lock(&bch_register_lock);
cancel_delayed_work_sync(&dc->writeback_rate_update);
if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
kthread_stop(dc->writeback_thread);
dc->writeback_thread = NULL;
}
memset(&dc->sb.set_uuid, 0, 16);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
bcache_device_detach(&dc->disk);
list_move(&dc->list, &uncached_devices);
clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
mutex_unlock(&bch_register_lock);
pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
/* Drop ref we took in cached_dev_detach() */
closure_put(&dc->disk.cl);
}
void bch_cached_dev_detach(struct cached_dev *dc)
{
lockdep_assert_held(&bch_register_lock);
if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
return;
if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
return;
/*
* Block the device from being closed and freed until we're finished
* detaching
*/
closure_get(&dc->disk.cl);
bch_writeback_queue(dc);
cached_dev_put(dc);
}
bcache: fix for data collapse after re-attaching an attached device back-end device sdm has already attached a cache_set with ID f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with another cache set, and it returns with an error: [root]# cd /sys/block/sdm/bcache [root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach -bash: echo: write error: Invalid argument After that, execute a command to modify the label of bcache device: [root]# echo data_disk1 > label Then we reboot the system, when the system power on, the back-end device can not attach to cache_set, a messages show in the log: Feb 5 12:05:52 ceph152 kernel: [922385.508498] bcache: bch_cached_dev_attach() couldn't find uuid for sdm in set In sysfs_attach(), dc->sb.set_uuid was assigned to the value which input through sysfs, no matter whether it is success or not in bch_cached_dev_attach(). For example, If the back-end device has already attached to an cache set, bch_cached_dev_attach() would fail, but dc->sb.set_uuid was changed. Then modify the label of bcache device, it will call bch_write_bdev_super(), which would write the dc->sb.set_uuid to the super block, so we record a wrong cache set ID in the super block, after the system reboot, the cache set couldn't find the uuid of the back-end device, so the bcache device couldn't exist and use any more. In this patch, we don't assigned cache set ID to dc->sb.set_uuid in sysfs_attach() directly, but input it into bch_cached_dev_attach(), and assigned dc->sb.set_uuid to the cache set ID after the back-end device attached to the cache set successful. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-07 19:41:46 +00:00
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
uint8_t *set_uuid)
{
uint32_t rtime = cpu_to_le32(get_seconds());
struct uuid_entry *u;
char buf[BDEVNAME_SIZE];
bdevname(dc->bdev, buf);
bcache: fix for data collapse after re-attaching an attached device back-end device sdm has already attached a cache_set with ID f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with another cache set, and it returns with an error: [root]# cd /sys/block/sdm/bcache [root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach -bash: echo: write error: Invalid argument After that, execute a command to modify the label of bcache device: [root]# echo data_disk1 > label Then we reboot the system, when the system power on, the back-end device can not attach to cache_set, a messages show in the log: Feb 5 12:05:52 ceph152 kernel: [922385.508498] bcache: bch_cached_dev_attach() couldn't find uuid for sdm in set In sysfs_attach(), dc->sb.set_uuid was assigned to the value which input through sysfs, no matter whether it is success or not in bch_cached_dev_attach(). For example, If the back-end device has already attached to an cache set, bch_cached_dev_attach() would fail, but dc->sb.set_uuid was changed. Then modify the label of bcache device, it will call bch_write_bdev_super(), which would write the dc->sb.set_uuid to the super block, so we record a wrong cache set ID in the super block, after the system reboot, the cache set couldn't find the uuid of the back-end device, so the bcache device couldn't exist and use any more. In this patch, we don't assigned cache set ID to dc->sb.set_uuid in sysfs_attach() directly, but input it into bch_cached_dev_attach(), and assigned dc->sb.set_uuid to the cache set ID after the back-end device attached to the cache set successful. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-07 19:41:46 +00:00
if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
(!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
return -ENOENT;
if (dc->disk.c) {
pr_err("Can't attach %s: already attached", buf);
return -EINVAL;
}
if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
pr_err("Can't attach %s: shutting down", buf);
return -EINVAL;
}
if (dc->sb.block_size < c->sb.block_size) {
/* Will die */
pr_err("Couldn't attach %s: block size less than set's block size",
buf);
return -EINVAL;
}
u = uuid_find(c, dc->sb.uuid);
if (u &&
(BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
memcpy(u->uuid, invalid_uuid, 16);
u->invalidated = cpu_to_le32(get_seconds());
u = NULL;
}
if (!u) {
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
pr_err("Couldn't find uuid for %s in set", buf);
return -ENOENT;
}
u = uuid_find_empty(c);
if (!u) {
pr_err("Not caching %s, no room for UUID", buf);
return -EINVAL;
}
}
/* Deadlocks since we're called via sysfs...
sysfs_remove_file(&dc->kobj, &sysfs_attach);
*/
if (bch_is_zero(u->uuid, 16)) {
struct closure cl;
closure_init_stack(&cl);
memcpy(u->uuid, dc->sb.uuid, 16);
memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
u->first_reg = u->last_reg = rtime;
bch_uuid_write(c);
memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
bch_write_bdev_super(dc, &cl);
closure_sync(&cl);
} else {
u->last_reg = rtime;
bch_uuid_write(c);
}
bcache_device_attach(&dc->disk, c, u - c->uuids);
list_move(&dc->list, &c->cached_devs);
calc_cached_dev_sectors(c);
smp_wmb();
/*
* dc->c must be set before dc->count != 0 - paired with the mb in
* cached_dev_get()
*/
refcount_set(&dc->count, 1);
/* Block writeback thread, but spawn it */
down_write(&dc->writeback_lock);
if (bch_cached_dev_writeback_start(dc)) {
up_write(&dc->writeback_lock);
return -ENOMEM;
}
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
bch_sectors_dirty_init(&dc->disk);
atomic_set(&dc->has_dirty, 1);
refcount_inc(&dc->count);
bch_writeback_queue(dc);
}
bch_cached_dev_run(dc);
bcache_device_link(&dc->disk, c, "bdev");
/* Allow the writeback thread to proceed */
up_write(&dc->writeback_lock);
pr_info("Caching %s as %s on set %pU",
bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
dc->disk.c->sb.set_uuid);
return 0;
}
void bch_cached_dev_release(struct kobject *kobj)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
disk.kobj);
kfree(dc);
module_put(THIS_MODULE);
}
static void cached_dev_free(struct closure *cl)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
cancel_delayed_work_sync(&dc->writeback_rate_update);
if (!IS_ERR_OR_NULL(dc->writeback_thread))
kthread_stop(dc->writeback_thread);
if (dc->writeback_write_wq)
destroy_workqueue(dc->writeback_write_wq);
mutex_lock(&bch_register_lock);
if (atomic_read(&dc->running))
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
bcache_device_free(&dc->disk);
list_del(&dc->list);
mutex_unlock(&bch_register_lock);
if (!IS_ERR_OR_NULL(dc->bdev))
blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
wake_up(&unregister_wait);
kobject_put(&dc->disk.kobj);
}
static void cached_dev_flush(struct closure *cl)
{
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
struct bcache_device *d = &dc->disk;
mutex_lock(&bch_register_lock);
bcache_device_unlink(d);
mutex_unlock(&bch_register_lock);
bch_cache_accounting_destroy(&dc->accounting);
kobject_del(&d->kobj);
continue_at(cl, cached_dev_free, system_wq);
}
static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
int ret;
struct io *io;
struct request_queue *q = bdev_get_queue(dc->bdev);
__module_get(THIS_MODULE);
INIT_LIST_HEAD(&dc->list);
closure_init(&dc->disk.cl, NULL);
set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
INIT_WORK(&dc->detach, cached_dev_detach_finish);
sema_init(&dc->sb_write_mutex, 1);
INIT_LIST_HEAD(&dc->io_lru);
spin_lock_init(&dc->io_lock);
bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
dc->sequential_cutoff = 4 << 20;
for (io = dc->io; io < dc->io + RECENT_IO; io++) {
list_add(&io->lru, &dc->io_lru);
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
}
dc->disk.stripe_size = q->limits.io_opt >> 9;
if (dc->disk.stripe_size)
dc->partial_stripes_expensive =
q->limits.raid_partial_stripes_expensive;
ret = bcache_device_init(&dc->disk, block_size,
dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
if (ret)
return ret;
dc->disk.disk->queue->backing_dev_info->ra_pages =
max(dc->disk.disk->queue->backing_dev_info->ra_pages,
q->backing_dev_info->ra_pages);
bch_cached_dev_request_init(dc);
bch_cached_dev_writeback_init(dc);
return 0;
}
/* Cached device - bcache superblock */
static void register_bdev(struct cache_sb *sb, struct page *sb_page,
struct block_device *bdev,
struct cached_dev *dc)
{
char name[BDEVNAME_SIZE];
const char *err = "cannot allocate memory";
struct cache_set *c;
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
dc->bdev = bdev;
dc->bdev->bd_holder = dc;
bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (cached_dev_init(dc, sb->block_size << 9))
goto err;
err = "error creating kobject";
if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
"bcache"))
goto err;
if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
goto err;
pr_info("registered backing device %s", bdevname(bdev, name));
list_add(&dc->list, &uncached_devices);
list_for_each_entry(c, &bch_cache_sets, list)
bcache: fix for data collapse after re-attaching an attached device back-end device sdm has already attached a cache_set with ID f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with another cache set, and it returns with an error: [root]# cd /sys/block/sdm/bcache [root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach -bash: echo: write error: Invalid argument After that, execute a command to modify the label of bcache device: [root]# echo data_disk1 > label Then we reboot the system, when the system power on, the back-end device can not attach to cache_set, a messages show in the log: Feb 5 12:05:52 ceph152 kernel: [922385.508498] bcache: bch_cached_dev_attach() couldn't find uuid for sdm in set In sysfs_attach(), dc->sb.set_uuid was assigned to the value which input through sysfs, no matter whether it is success or not in bch_cached_dev_attach(). For example, If the back-end device has already attached to an cache set, bch_cached_dev_attach() would fail, but dc->sb.set_uuid was changed. Then modify the label of bcache device, it will call bch_write_bdev_super(), which would write the dc->sb.set_uuid to the super block, so we record a wrong cache set ID in the super block, after the system reboot, the cache set couldn't find the uuid of the back-end device, so the bcache device couldn't exist and use any more. In this patch, we don't assigned cache set ID to dc->sb.set_uuid in sysfs_attach() directly, but input it into bch_cached_dev_attach(), and assigned dc->sb.set_uuid to the cache set ID after the back-end device attached to the cache set successful. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-07 19:41:46 +00:00
bch_cached_dev_attach(dc, c, NULL);
if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
bch_cached_dev_run(dc);
return;
err:
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
pr_notice("error %s: %s", bdevname(bdev, name), err);
bcache_device_stop(&dc->disk);
}
/* Flash only volumes */
void bch_flash_dev_release(struct kobject *kobj)
{
struct bcache_device *d = container_of(kobj, struct bcache_device,
kobj);
kfree(d);
}
static void flash_dev_free(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
bcache_device_free(d);
mutex_unlock(&bch_register_lock);
kobject_put(&d->kobj);
}
static void flash_dev_flush(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
bcache_device_unlink(d);
mutex_unlock(&bch_register_lock);
kobject_del(&d->kobj);
continue_at(cl, flash_dev_free, system_wq);
}
static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
GFP_KERNEL);
if (!d)
return -ENOMEM;
closure_init(&d->cl, NULL);
set_closure_fn(&d->cl, flash_dev_flush, system_wq);
kobject_init(&d->kobj, &bch_flash_dev_ktype);
if (bcache_device_init(d, block_bytes(c), u->sectors))
goto err;
bcache_device_attach(d, c, u - c->uuids);
bch_sectors_dirty_init(d);
bch_flash_dev_request_init(d);
add_disk(d->disk);
if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
goto err;
bcache_device_link(d, c, "volume");
return 0;
err:
kobject_put(&d->kobj);
return -ENOMEM;
}
static int flash_devs_run(struct cache_set *c)
{
int ret = 0;
struct uuid_entry *u;
for (u = c->uuids;
u < c->uuids + c->nr_uuids && !ret;
u++)
if (UUID_FLASH_ONLY(u))
ret = flash_dev_run(c, u);
return ret;
}
int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
struct uuid_entry *u;
if (test_bit(CACHE_SET_STOPPING, &c->flags))
return -EINTR;
if (!test_bit(CACHE_SET_RUNNING, &c->flags))
return -EPERM;
u = uuid_find_empty(c);
if (!u) {
pr_err("Can't create volume, no room for UUID");
return -EINVAL;
}
get_random_bytes(u->uuid, 16);
memset(u->label, 0, 32);
u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
SET_UUID_FLASH_ONLY(u, 1);
u->sectors = size >> 9;
bch_uuid_write(c);
return flash_dev_run(c, u);
}
/* Cache set */
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
va_list args;
if (c->on_error != ON_ERROR_PANIC &&
test_bit(CACHE_SET_STOPPING, &c->flags))
return false;
/* XXX: we can be called from atomic context
acquire_console_sem();
*/
printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
va_start(args, fmt);
vprintk(fmt, args);
va_end(args);
printk(", disabling caching\n");
if (c->on_error == ON_ERROR_PANIC)
panic("panic forced after error\n");
bch_cache_set_unregister(c);
return true;
}
void bch_cache_set_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
kfree(c);
module_put(THIS_MODULE);
}
static void cache_set_free(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, cl);
struct cache *ca;
unsigned i;
if (!IS_ERR_OR_NULL(c->debug))
debugfs_remove(c->debug);
bch_open_buckets_free(c);
bch_btree_cache_free(c);
bch_journal_free(c);
for_each_cache(ca, c, i)
if (ca) {
ca->set = NULL;
c->cache[ca->sb.nr_this_dev] = NULL;
kobject_put(&ca->kobj);
}
bch_bset_sort_state_free(&c->sort);
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
if (c->moving_gc_wq)
destroy_workqueue(c->moving_gc_wq);
if (c->bio_split)
bioset_free(c->bio_split);
if (c->fill_iter)
mempool_destroy(c->fill_iter);
if (c->bio_meta)
mempool_destroy(c->bio_meta);
if (c->search)
mempool_destroy(c->search);
kfree(c->devices);
mutex_lock(&bch_register_lock);
list_del(&c->list);
mutex_unlock(&bch_register_lock);
pr_info("Cache set %pU unregistered", c->sb.set_uuid);
wake_up(&unregister_wait);
closure_debug_destroy(&c->cl);
kobject_put(&c->kobj);
}
static void cache_set_flush(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
struct cache *ca;
struct btree *b;
unsigned i;
bch_cache_accounting_destroy(&c->accounting);
kobject_put(&c->internal);
kobject_del(&c->kobj);
if (c->gc_thread)
kthread_stop(c->gc_thread);
if (!IS_ERR_OR_NULL(c->root))
list_add(&c->root->list, &c->btree_cache);
/* Should skip this if we're unregistering because of an error */
list_for_each_entry(b, &c->btree_cache, list) {
mutex_lock(&b->write_lock);
if (btree_node_dirty(b))
__bch_btree_node_write(b, NULL);
mutex_unlock(&b->write_lock);
}
for_each_cache(ca, c, i)
if (ca->alloc_thread)
kthread_stop(ca->alloc_thread);
if (c->journal.cur) {
cancel_delayed_work_sync(&c->journal.work);
/* flush last journal entry if needed */
c->journal.work.work.func(&c->journal.work.work);
}
closure_return(cl);
}
static void __cache_set_unregister(struct closure *cl)
{
struct cache_set *c = container_of(cl, struct cache_set, caching);
struct cached_dev *dc;
size_t i;
mutex_lock(&bch_register_lock);
for (i = 0; i < c->devices_max_used; i++)
if (c->devices[i]) {
if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
dc = container_of(c->devices[i],
struct cached_dev, disk);
bch_cached_dev_detach(dc);
} else {
bcache_device_stop(c->devices[i]);
}
}
mutex_unlock(&bch_register_lock);
continue_at(cl, cache_set_flush, system_wq);
}
void bch_cache_set_stop(struct cache_set *c)
{
if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
closure_queue(&c->caching);
}
void bch_cache_set_unregister(struct cache_set *c)
{
set_bit(CACHE_SET_UNREGISTERING, &c->flags);
bch_cache_set_stop(c);
}
#define alloc_bucket_pages(gfp, c) \
((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
int iter_size;
struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
if (!c)
return NULL;
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
set_closure_fn(&c->cl, cache_set_free, system_wq);
closure_init(&c->caching, &c->cl);
set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
/* Maybe create continue_at_noreturn() and use it here? */
closure_set_stopped(&c->cl);
closure_put(&c->cl);
kobject_init(&c->kobj, &bch_cache_set_ktype);
kobject_init(&c->internal, &bch_cache_set_internal_ktype);
bch_cache_accounting_init(&c->accounting, &c->cl);
memcpy(c->sb.set_uuid, sb->set_uuid, 16);
c->sb.block_size = sb->block_size;
c->sb.bucket_size = sb->bucket_size;
c->sb.nr_in_set = sb->nr_in_set;
c->sb.last_mount = sb->last_mount;
c->bucket_bits = ilog2(sb->bucket_size);
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
c->devices_max_used = 0;
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
BTREE_MAX_PAGES);
sema_init(&c->sb_write_mutex, 1);
mutex_init(&c->bucket_lock);
init_waitqueue_head(&c->btree_cache_wait);
init_waitqueue_head(&c->bucket_wait);
init_waitqueue_head(&c->gc_wait);
sema_init(&c->uuid_write_mutex, 1);
spin_lock_init(&c->btree_gc_time.lock);
spin_lock_init(&c->btree_split_time.lock);
spin_lock_init(&c->btree_read_time.lock);
bch_moving_init_cache_set(c);
INIT_LIST_HEAD(&c->list);
INIT_LIST_HEAD(&c->cached_devs);
INIT_LIST_HEAD(&c->btree_cache);
INIT_LIST_HEAD(&c->btree_cache_freeable);
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
c->search = mempool_create_slab_pool(32, bch_search_cache);
if (!c->search)
goto err;
iter_size = (sb->bucket_size / sb->block_size + 1) *
sizeof(struct btree_iter_set);
if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
!(c->bio_meta = mempool_create_kmalloc_pool(2,
sizeof(struct bbio) + sizeof(struct bio_vec) *
bucket_pages(c))) ||
!(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
!(c->bio_split = bioset_create(4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS |
BIOSET_NEED_RESCUER)) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->moving_gc_wq = alloc_workqueue("bcache_gc",
WQ_MEM_RECLAIM, 0)) ||
bch_journal_alloc(c) ||
bch_btree_cache_alloc(c) ||
bch_open_buckets_alloc(c) ||
bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
goto err;
c->congested_read_threshold_us = 2000;
c->congested_write_threshold_us = 20000;
bcache: set error_limit correctly Struct cache uses io_errors for two purposes, - Error decay: when cache set error_decay is set, io_errors is used to generate a small piece of delay when I/O error happens. - I/O errors counter: in order to generate big enough value for error decay, I/O errors counter value is stored by left shifting 20 bits (a.k.a IO_ERROR_SHIFT). In function bch_count_io_errors(), if I/O errors counter reaches cache set error limit, bch_cache_set_error() will be called to retire the whold cache set. But current code is problematic when checking the error limit, see the following code piece from bch_count_io_errors(), 90 if (error) { 91 char buf[BDEVNAME_SIZE]; 92 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, 93 &ca->io_errors); 94 errors >>= IO_ERROR_SHIFT; 95 96 if (errors < ca->set->error_limit) 97 pr_err("%s: IO error on %s, recovering", 98 bdevname(ca->bdev, buf), m); 99 else 100 bch_cache_set_error(ca->set, 101 "%s: too many IO errors %s", 102 bdevname(ca->bdev, buf), m); 103 } At line 94, errors is right shifting IO_ERROR_SHIFT bits, now it is real errors counter to compare at line 96. But ca->set->error_limit is initia- lized with an amplified value in bch_cache_set_alloc(), 1545 c->error_limit = 8 << IO_ERROR_SHIFT; It means by default, in bch_count_io_errors(), before 8<<20 errors happened bch_cache_set_error() won't be called to retire the problematic cache device. If the average request size is 64KB, it means bcache won't handle failed device until 512GB data is requested. This is too large to be an I/O threashold. So I believe the correct error limit should be much less. This patch sets default cache set error limit to 8, then in bch_count_io_errors() when errors counter reaches 8 (if it is default value), function bch_cache_set_error() will be called to retire the whole cache set. This patch also removes bits shifting when store or show io_error_limit value via sysfs interface. Nowadays most of SSDs handle internal flash failure automatically by LBA address re-indirect mapping. If an I/O error can be observed by upper layer code, it will be a notable error because that SSD can not re-indirect map the problematic LBA address to an available flash block. This situation indicates the whole SSD will be failed very soon. Therefore setting 8 as the default io error limit value makes sense, it is enough for most of cache devices. Changelog: v2: add reviewed-by from Hannes. v1: initial version for review. Signed-off-by: Coly Li <colyli@suse.de> Reviewed-by: Hannes Reinecke <hare@suse.com> Reviewed-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: Junhui Tang <tang.junhui@zte.com.cn> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-07 19:41:42 +00:00
c->error_limit = DEFAULT_IO_ERROR_LIMIT;
return c;
err:
bch_cache_set_unregister(c);
return NULL;
}
static void run_cache_set(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct cached_dev *dc, *t;
struct cache *ca;
struct closure cl;
unsigned i;
closure_init_stack(&cl);
for_each_cache(ca, c, i)
c->nbuckets += ca->sb.nbuckets;
set_gc_sectors(c);
if (CACHE_SYNC(&c->sb)) {
LIST_HEAD(journal);
struct bkey *k;
struct jset *j;
err = "cannot allocate memory for journal";
if (bch_journal_read(c, &journal))
goto err;
pr_debug("btree_journal_read() done");
err = "no journal entries found";
if (list_empty(&journal))
goto err;
j = &list_entry(journal.prev, struct journal_replay, list)->j;
err = "IO error reading priorities";
for_each_cache(ca, c, i)
prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
/*
* If prio_read() fails it'll call cache_set_error and we'll
* tear everything down right away, but if we perhaps checked
* sooner we could avoid journal replay.
*/
k = &j->btree_root;
err = "bad btree root";
if (__bch_btree_ptr_invalid(c, k))
goto err;
err = "error reading btree root";
c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
if (IS_ERR_OR_NULL(c->root))
goto err;
list_del_init(&c->root->list);
rw_unlock(true, c->root);
err = uuid_read(c, j, &cl);
if (err)
goto err;
err = "error in recovery";
if (bch_btree_check(c))
goto err;
bch_journal_mark(c, &journal);
bch_initial_gc_finish(c);
pr_debug("btree_check() done");
/*
* bcache_journal_next() can't happen sooner, or
* btree_gc_finish() will give spurious errors about last_gc >
* gc_gen - this is a hack but oh well.
*/
bch_journal_next(&c->journal);
err = "error starting allocator thread";
for_each_cache(ca, c, i)
if (bch_cache_allocator_start(ca))
goto err;
/*
* First place it's safe to allocate: btree_check() and
* btree_gc_finish() have to run before we have buckets to
* allocate, and bch_bucket_alloc_set() might cause a journal
* entry to be written so bcache_journal_next() has to be called
* first.
*
* If the uuids were in the old format we have to rewrite them
* before the next journal entry is written:
*/
if (j->version < BCACHE_JSET_VERSION_UUID)
__uuid_write(c);
bch_journal_replay(c, &journal);
} else {
pr_notice("invalidating existing data");
for_each_cache(ca, c, i) {
unsigned j;
ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
2, SB_JOURNAL_BUCKETS);
for (j = 0; j < ca->sb.keys; j++)
ca->sb.d[j] = ca->sb.first_bucket + j;
}
bch_initial_gc_finish(c);
err = "error starting allocator thread";
for_each_cache(ca, c, i)
if (bch_cache_allocator_start(ca))
goto err;
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i)
bch_prio_write(ca);
mutex_unlock(&c->bucket_lock);
err = "cannot allocate new UUID bucket";
if (__uuid_write(c))
goto err;
err = "cannot allocate new btree root";
c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
if (IS_ERR_OR_NULL(c->root))
goto err;
mutex_lock(&c->root->write_lock);
bkey_copy_key(&c->root->key, &MAX_KEY);
bch_btree_node_write(c->root, &cl);
mutex_unlock(&c->root->write_lock);
bch_btree_set_root(c->root);
rw_unlock(true, c->root);
/*
* We don't want to write the first journal entry until
* everything is set up - fortunately journal entries won't be
* written until the SET_CACHE_SYNC() here:
*/
SET_CACHE_SYNC(&c->sb, true);
bch_journal_next(&c->journal);
bch_journal_meta(c, &cl);
}
err = "error starting gc thread";
if (bch_gc_thread_start(c))
goto err;
closure_sync(&cl);
c->sb.last_mount = get_seconds();
bcache_write_super(c);
list_for_each_entry_safe(dc, t, &uncached_devices, list)
bcache: fix for data collapse after re-attaching an attached device back-end device sdm has already attached a cache_set with ID f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with another cache set, and it returns with an error: [root]# cd /sys/block/sdm/bcache [root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach -bash: echo: write error: Invalid argument After that, execute a command to modify the label of bcache device: [root]# echo data_disk1 > label Then we reboot the system, when the system power on, the back-end device can not attach to cache_set, a messages show in the log: Feb 5 12:05:52 ceph152 kernel: [922385.508498] bcache: bch_cached_dev_attach() couldn't find uuid for sdm in set In sysfs_attach(), dc->sb.set_uuid was assigned to the value which input through sysfs, no matter whether it is success or not in bch_cached_dev_attach(). For example, If the back-end device has already attached to an cache set, bch_cached_dev_attach() would fail, but dc->sb.set_uuid was changed. Then modify the label of bcache device, it will call bch_write_bdev_super(), which would write the dc->sb.set_uuid to the super block, so we record a wrong cache set ID in the super block, after the system reboot, the cache set couldn't find the uuid of the back-end device, so the bcache device couldn't exist and use any more. In this patch, we don't assigned cache set ID to dc->sb.set_uuid in sysfs_attach() directly, but input it into bch_cached_dev_attach(), and assigned dc->sb.set_uuid to the cache set ID after the back-end device attached to the cache set successful. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-07 19:41:46 +00:00
bch_cached_dev_attach(dc, c, NULL);
flash_devs_run(c);
set_bit(CACHE_SET_RUNNING, &c->flags);
return;
err:
closure_sync(&cl);
/* XXX: test this, it's broken */
bch_cache_set_error(c, "%s", err);
}
static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
return ca->sb.block_size == c->sb.block_size &&
ca->sb.bucket_size == c->sb.bucket_size &&
ca->sb.nr_in_set == c->sb.nr_in_set;
}
static const char *register_cache_set(struct cache *ca)
{
char buf[12];
const char *err = "cannot allocate memory";
struct cache_set *c;
list_for_each_entry(c, &bch_cache_sets, list)
if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
if (c->cache[ca->sb.nr_this_dev])
return "duplicate cache set member";
if (!can_attach_cache(ca, c))
return "cache sb does not match set";
if (!CACHE_SYNC(&ca->sb))
SET_CACHE_SYNC(&c->sb, false);
goto found;
}
c = bch_cache_set_alloc(&ca->sb);
if (!c)
return err;
err = "error creating kobject";
if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
kobject_add(&c->internal, &c->kobj, "internal"))
goto err;
if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
goto err;
bch_debug_init_cache_set(c);
list_add(&c->list, &bch_cache_sets);
found:
sprintf(buf, "cache%i", ca->sb.nr_this_dev);
if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
sysfs_create_link(&c->kobj, &ca->kobj, buf))
goto err;
if (ca->sb.seq > c->sb.seq) {
c->sb.version = ca->sb.version;
memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
c->sb.flags = ca->sb.flags;
c->sb.seq = ca->sb.seq;
pr_debug("set version = %llu", c->sb.version);
}
kobject_get(&ca->kobj);
ca->set = c;
ca->set->cache[ca->sb.nr_this_dev] = ca;
c->cache_by_alloc[c->caches_loaded++] = ca;
if (c->caches_loaded == c->sb.nr_in_set)
run_cache_set(c);
return NULL;
err:
bch_cache_set_unregister(c);
return err;
}
/* Cache device */
void bch_cache_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
unsigned i;
if (ca->set) {
BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
ca->set->cache[ca->sb.nr_this_dev] = NULL;
}
free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
kfree(ca->prio_buckets);
vfree(ca->buckets);
free_heap(&ca->heap);
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
put_page(bio_first_page_all(&ca->sb_bio));
if (!IS_ERR_OR_NULL(ca->bdev))
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
kfree(ca);
module_put(THIS_MODULE);
}
static int cache_alloc(struct cache *ca)
{
size_t free;
bcache: fix for allocator and register thread race After long time running of random small IO writing, I reboot the machine, and after the machine power on, I found bcache got stuck, the stack is: [root@ceph153 ~]# cat /proc/2510/task/*/stack [<ffffffffa06b2455>] closure_sync+0x25/0x90 [bcache] [<ffffffffa06b6be8>] bch_journal+0x118/0x2b0 [bcache] [<ffffffffa06b6dc7>] bch_journal_meta+0x47/0x70 [bcache] [<ffffffffa06be8f7>] bch_prio_write+0x237/0x340 [bcache] [<ffffffffa06a8018>] bch_allocator_thread+0x3c8/0x3d0 [bcache] [<ffffffff810a631f>] kthread+0xcf/0xe0 [<ffffffff8164c318>] ret_from_fork+0x58/0x90 [<ffffffffffffffff>] 0xffffffffffffffff [root@ceph153 ~]# cat /proc/2038/task/*/stack [<ffffffffa06b1abd>] __bch_btree_map_nodes+0x12d/0x150 [bcache] [<ffffffffa06b1bd1>] bch_btree_insert+0xf1/0x170 [bcache] [<ffffffffa06b637f>] bch_journal_replay+0x13f/0x230 [bcache] [<ffffffffa06c75fe>] run_cache_set+0x79a/0x7c2 [bcache] [<ffffffffa06c0cf8>] register_bcache+0xd48/0x1310 [bcache] [<ffffffff812f702f>] kobj_attr_store+0xf/0x20 [<ffffffff8125b216>] sysfs_write_file+0xc6/0x140 [<ffffffff811dfbfd>] vfs_write+0xbd/0x1e0 [<ffffffff811e069f>] SyS_write+0x7f/0xe0 [<ffffffff8164c3c9>] system_call_fastpath+0x16/0x1 The stack shows the register thread and allocator thread were getting stuck when registering cache device. I reboot the machine several times, the issue always exsit in this machine. I debug the code, and found the call trace as bellow: register_bcache() ==>run_cache_set() ==>bch_journal_replay() ==>bch_btree_insert() ==>__bch_btree_map_nodes() ==>btree_insert_fn() ==>btree_split() //node need split ==>btree_check_reserve() In btree_check_reserve(), It will check if there is enough buckets of RESERVE_BTREE type, since allocator thread did not work yet, so no buckets of RESERVE_BTREE type allocated, so the register thread waits on c->btree_cache_wait, and goes to sleep. Then the allocator thread initialized, the call trace is bellow: bch_allocator_thread() ==>bch_prio_write() ==>bch_journal_meta() ==>bch_journal() ==>journal_wait_for_write() In journal_wait_for_write(), It will check if journal is full by journal_full(), but the long time random small IO writing causes the exhaustion of journal buckets(journal.blocks_free=0), In order to release the journal buckets, the allocator calls btree_flush_write() to flush keys to btree nodes, and waits on c->journal.wait until btree nodes writing over or there has already some journal buckets space, then the allocator thread goes to sleep. but in btree_flush_write(), since bch_journal_replay() is not finished, so no btree nodes have journal (condition "if (btree_current_write(b)->journal)" never satisfied), so we got no btree node to flush, no journal bucket released, and allocator sleep all the times. Through the above analysis, we can see that: 1) Register thread wait for allocator thread to allocate buckets of RESERVE_BTREE type; 2) Alloctor thread wait for register thread to replay journal, so it can flush btree nodes and get journal bucket. then they are all got stuck by waiting for each other. Hua Rui provided a patch for me, by allocating some buckets of RESERVE_BTREE type in advance, so the register thread can get bucket when btree node splitting and no need to waiting for the allocator thread. I tested it, it has effect, and register thread run a step forward, but finally are still got stuck, the reason is only 8 bucket of RESERVE_BTREE type were allocated, and in bch_journal_replay(), after 2 btree nodes splitting, only 4 bucket of RESERVE_BTREE type left, then btree_check_reserve() is not satisfied anymore, so it goes to sleep again, and in the same time, alloctor thread did not flush enough btree nodes to release a journal bucket, so they all got stuck again. So we need to allocate more buckets of RESERVE_BTREE type in advance, but how much is enough? By experience and test, I think it should be as much as journal buckets. Then I modify the code as this patch, and test in the machine, and it works. This patch modified base on Hua Rui’s patch, and allocate more buckets of RESERVE_BTREE type in advance to avoid register thread and allocate thread going to wait for each other. [patch v2] ca->sb.njournal_buckets would be 0 in the first time after cache creation, and no journal exists, so just 8 btree buckets is OK. Signed-off-by: Hua Rui <huarui.dev@gmail.com> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-07 19:41:43 +00:00
size_t btree_buckets;
struct bucket *b;
__module_get(THIS_MODULE);
kobject_init(&ca->kobj, &bch_cache_ktype);
bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
bcache: fix for allocator and register thread race After long time running of random small IO writing, I reboot the machine, and after the machine power on, I found bcache got stuck, the stack is: [root@ceph153 ~]# cat /proc/2510/task/*/stack [<ffffffffa06b2455>] closure_sync+0x25/0x90 [bcache] [<ffffffffa06b6be8>] bch_journal+0x118/0x2b0 [bcache] [<ffffffffa06b6dc7>] bch_journal_meta+0x47/0x70 [bcache] [<ffffffffa06be8f7>] bch_prio_write+0x237/0x340 [bcache] [<ffffffffa06a8018>] bch_allocator_thread+0x3c8/0x3d0 [bcache] [<ffffffff810a631f>] kthread+0xcf/0xe0 [<ffffffff8164c318>] ret_from_fork+0x58/0x90 [<ffffffffffffffff>] 0xffffffffffffffff [root@ceph153 ~]# cat /proc/2038/task/*/stack [<ffffffffa06b1abd>] __bch_btree_map_nodes+0x12d/0x150 [bcache] [<ffffffffa06b1bd1>] bch_btree_insert+0xf1/0x170 [bcache] [<ffffffffa06b637f>] bch_journal_replay+0x13f/0x230 [bcache] [<ffffffffa06c75fe>] run_cache_set+0x79a/0x7c2 [bcache] [<ffffffffa06c0cf8>] register_bcache+0xd48/0x1310 [bcache] [<ffffffff812f702f>] kobj_attr_store+0xf/0x20 [<ffffffff8125b216>] sysfs_write_file+0xc6/0x140 [<ffffffff811dfbfd>] vfs_write+0xbd/0x1e0 [<ffffffff811e069f>] SyS_write+0x7f/0xe0 [<ffffffff8164c3c9>] system_call_fastpath+0x16/0x1 The stack shows the register thread and allocator thread were getting stuck when registering cache device. I reboot the machine several times, the issue always exsit in this machine. I debug the code, and found the call trace as bellow: register_bcache() ==>run_cache_set() ==>bch_journal_replay() ==>bch_btree_insert() ==>__bch_btree_map_nodes() ==>btree_insert_fn() ==>btree_split() //node need split ==>btree_check_reserve() In btree_check_reserve(), It will check if there is enough buckets of RESERVE_BTREE type, since allocator thread did not work yet, so no buckets of RESERVE_BTREE type allocated, so the register thread waits on c->btree_cache_wait, and goes to sleep. Then the allocator thread initialized, the call trace is bellow: bch_allocator_thread() ==>bch_prio_write() ==>bch_journal_meta() ==>bch_journal() ==>journal_wait_for_write() In journal_wait_for_write(), It will check if journal is full by journal_full(), but the long time random small IO writing causes the exhaustion of journal buckets(journal.blocks_free=0), In order to release the journal buckets, the allocator calls btree_flush_write() to flush keys to btree nodes, and waits on c->journal.wait until btree nodes writing over or there has already some journal buckets space, then the allocator thread goes to sleep. but in btree_flush_write(), since bch_journal_replay() is not finished, so no btree nodes have journal (condition "if (btree_current_write(b)->journal)" never satisfied), so we got no btree node to flush, no journal bucket released, and allocator sleep all the times. Through the above analysis, we can see that: 1) Register thread wait for allocator thread to allocate buckets of RESERVE_BTREE type; 2) Alloctor thread wait for register thread to replay journal, so it can flush btree nodes and get journal bucket. then they are all got stuck by waiting for each other. Hua Rui provided a patch for me, by allocating some buckets of RESERVE_BTREE type in advance, so the register thread can get bucket when btree node splitting and no need to waiting for the allocator thread. I tested it, it has effect, and register thread run a step forward, but finally are still got stuck, the reason is only 8 bucket of RESERVE_BTREE type were allocated, and in bch_journal_replay(), after 2 btree nodes splitting, only 4 bucket of RESERVE_BTREE type left, then btree_check_reserve() is not satisfied anymore, so it goes to sleep again, and in the same time, alloctor thread did not flush enough btree nodes to release a journal bucket, so they all got stuck again. So we need to allocate more buckets of RESERVE_BTREE type in advance, but how much is enough? By experience and test, I think it should be as much as journal buckets. Then I modify the code as this patch, and test in the machine, and it works. This patch modified base on Hua Rui’s patch, and allocate more buckets of RESERVE_BTREE type in advance to avoid register thread and allocate thread going to wait for each other. [patch v2] ca->sb.njournal_buckets would be 0 in the first time after cache creation, and no journal exists, so just 8 btree buckets is OK. Signed-off-by: Hua Rui <huarui.dev@gmail.com> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-07 19:41:43 +00:00
/*
* when ca->sb.njournal_buckets is not zero, journal exists,
* and in bch_journal_replay(), tree node may split,
* so bucket of RESERVE_BTREE type is needed,
* the worst situation is all journal buckets are valid journal,
* and all the keys need to replay,
* so the number of RESERVE_BTREE type buckets should be as much
* as journal buckets
*/
btree_buckets = ca->sb.njournal_buckets ?: 8;
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
bcache: fix for allocator and register thread race After long time running of random small IO writing, I reboot the machine, and after the machine power on, I found bcache got stuck, the stack is: [root@ceph153 ~]# cat /proc/2510/task/*/stack [<ffffffffa06b2455>] closure_sync+0x25/0x90 [bcache] [<ffffffffa06b6be8>] bch_journal+0x118/0x2b0 [bcache] [<ffffffffa06b6dc7>] bch_journal_meta+0x47/0x70 [bcache] [<ffffffffa06be8f7>] bch_prio_write+0x237/0x340 [bcache] [<ffffffffa06a8018>] bch_allocator_thread+0x3c8/0x3d0 [bcache] [<ffffffff810a631f>] kthread+0xcf/0xe0 [<ffffffff8164c318>] ret_from_fork+0x58/0x90 [<ffffffffffffffff>] 0xffffffffffffffff [root@ceph153 ~]# cat /proc/2038/task/*/stack [<ffffffffa06b1abd>] __bch_btree_map_nodes+0x12d/0x150 [bcache] [<ffffffffa06b1bd1>] bch_btree_insert+0xf1/0x170 [bcache] [<ffffffffa06b637f>] bch_journal_replay+0x13f/0x230 [bcache] [<ffffffffa06c75fe>] run_cache_set+0x79a/0x7c2 [bcache] [<ffffffffa06c0cf8>] register_bcache+0xd48/0x1310 [bcache] [<ffffffff812f702f>] kobj_attr_store+0xf/0x20 [<ffffffff8125b216>] sysfs_write_file+0xc6/0x140 [<ffffffff811dfbfd>] vfs_write+0xbd/0x1e0 [<ffffffff811e069f>] SyS_write+0x7f/0xe0 [<ffffffff8164c3c9>] system_call_fastpath+0x16/0x1 The stack shows the register thread and allocator thread were getting stuck when registering cache device. I reboot the machine several times, the issue always exsit in this machine. I debug the code, and found the call trace as bellow: register_bcache() ==>run_cache_set() ==>bch_journal_replay() ==>bch_btree_insert() ==>__bch_btree_map_nodes() ==>btree_insert_fn() ==>btree_split() //node need split ==>btree_check_reserve() In btree_check_reserve(), It will check if there is enough buckets of RESERVE_BTREE type, since allocator thread did not work yet, so no buckets of RESERVE_BTREE type allocated, so the register thread waits on c->btree_cache_wait, and goes to sleep. Then the allocator thread initialized, the call trace is bellow: bch_allocator_thread() ==>bch_prio_write() ==>bch_journal_meta() ==>bch_journal() ==>journal_wait_for_write() In journal_wait_for_write(), It will check if journal is full by journal_full(), but the long time random small IO writing causes the exhaustion of journal buckets(journal.blocks_free=0), In order to release the journal buckets, the allocator calls btree_flush_write() to flush keys to btree nodes, and waits on c->journal.wait until btree nodes writing over or there has already some journal buckets space, then the allocator thread goes to sleep. but in btree_flush_write(), since bch_journal_replay() is not finished, so no btree nodes have journal (condition "if (btree_current_write(b)->journal)" never satisfied), so we got no btree node to flush, no journal bucket released, and allocator sleep all the times. Through the above analysis, we can see that: 1) Register thread wait for allocator thread to allocate buckets of RESERVE_BTREE type; 2) Alloctor thread wait for register thread to replay journal, so it can flush btree nodes and get journal bucket. then they are all got stuck by waiting for each other. Hua Rui provided a patch for me, by allocating some buckets of RESERVE_BTREE type in advance, so the register thread can get bucket when btree node splitting and no need to waiting for the allocator thread. I tested it, it has effect, and register thread run a step forward, but finally are still got stuck, the reason is only 8 bucket of RESERVE_BTREE type were allocated, and in bch_journal_replay(), after 2 btree nodes splitting, only 4 bucket of RESERVE_BTREE type left, then btree_check_reserve() is not satisfied anymore, so it goes to sleep again, and in the same time, alloctor thread did not flush enough btree nodes to release a journal bucket, so they all got stuck again. So we need to allocate more buckets of RESERVE_BTREE type in advance, but how much is enough? By experience and test, I think it should be as much as journal buckets. Then I modify the code as this patch, and test in the machine, and it works. This patch modified base on Hua Rui’s patch, and allocate more buckets of RESERVE_BTREE type in advance to avoid register thread and allocate thread going to wait for each other. [patch v2] ca->sb.njournal_buckets would be 0 in the first time after cache creation, and no journal exists, so just 8 btree buckets is OK. Signed-off-by: Hua Rui <huarui.dev@gmail.com> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reviewed-by: Michael Lyle <mlyle@lyle.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-07 19:41:43 +00:00
if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
bcache: RESERVE_PRIO is too small by one when prio_buckets() is a power of two. This patch fixes a cachedev registration-time allocation deadlock. This can deadlock on boot if your initrd auto-registeres bcache devices: Allocator thread: [ 720.727614] INFO: task bcache_allocato:3833 blocked for more than 120 seconds. [ 720.732361] [<ffffffff816eeac7>] schedule+0x37/0x90 [ 720.732963] [<ffffffffa05192b8>] bch_bucket_alloc+0x188/0x360 [bcache] [ 720.733538] [<ffffffff810e6950>] ? prepare_to_wait_event+0xf0/0xf0 [ 720.734137] [<ffffffffa05302bd>] bch_prio_write+0x19d/0x340 [bcache] [ 720.734715] [<ffffffffa05190bf>] bch_allocator_thread+0x3ff/0x470 [bcache] [ 720.735311] [<ffffffff816ee41c>] ? __schedule+0x2dc/0x950 [ 720.735884] [<ffffffffa0518cc0>] ? invalidate_buckets+0x980/0x980 [bcache] Registration thread: [ 720.710403] INFO: task bash:3531 blocked for more than 120 seconds. [ 720.715226] [<ffffffff816eeac7>] schedule+0x37/0x90 [ 720.715805] [<ffffffffa05235cd>] __bch_btree_map_nodes+0x12d/0x150 [bcache] [ 720.716409] [<ffffffffa0522d30>] ? bch_btree_insert_check_key+0x1c0/0x1c0 [bcache] [ 720.717008] [<ffffffffa05236e4>] bch_btree_insert+0xf4/0x170 [bcache] [ 720.717586] [<ffffffff810e6950>] ? prepare_to_wait_event+0xf0/0xf0 [ 720.718191] [<ffffffffa0527d9a>] bch_journal_replay+0x14a/0x290 [bcache] [ 720.718766] [<ffffffff810cc90d>] ? ttwu_do_activate.constprop.94+0x5d/0x70 [ 720.719369] [<ffffffff810cf684>] ? try_to_wake_up+0x1d4/0x350 [ 720.719968] [<ffffffffa05317d0>] run_cache_set+0x580/0x8e0 [bcache] [ 720.720553] [<ffffffffa053302e>] register_bcache+0xe2e/0x13b0 [bcache] [ 720.721153] [<ffffffff81354cef>] kobj_attr_store+0xf/0x20 [ 720.721730] [<ffffffff812a2dad>] sysfs_kf_write+0x3d/0x50 [ 720.722327] [<ffffffff812a225a>] kernfs_fop_write+0x12a/0x180 [ 720.722904] [<ffffffff81225177>] __vfs_write+0x37/0x110 [ 720.723503] [<ffffffff81228048>] ? __sb_start_write+0x58/0x110 [ 720.724100] [<ffffffff812cedb3>] ? security_file_permission+0x23/0xa0 [ 720.724675] [<ffffffff812258a9>] vfs_write+0xa9/0x1b0 [ 720.725275] [<ffffffff8102479c>] ? do_audit_syscall_entry+0x6c/0x70 [ 720.725849] [<ffffffff81226755>] SyS_write+0x55/0xd0 [ 720.726451] [<ffffffff8106a390>] ? do_page_fault+0x30/0x80 [ 720.727045] [<ffffffff816f2cae>] system_call_fastpath+0x12/0x71 The fifo code in upstream bcache can't use the last element in the buffer, which was the cause of the bug: if you asked for a power of two size, it'd give you a fifo that could hold one less than what you asked for rather than allocating a buffer twice as big. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com> Tested-by: Eric Wheeler <bcache@linux.ewheeler.net> Cc: stable@vger.kernel.org
2016-08-18 01:21:24 +00:00
!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
!init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
!(ca->buckets = vzalloc(sizeof(struct bucket) *
ca->sb.nbuckets)) ||
!(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
2, GFP_KERNEL)) ||
!(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
return -ENOMEM;
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
for_each_bucket(b, ca)
atomic_set(&b->pin, 0);
return 0;
}
static int register_cache(struct cache_sb *sb, struct page *sb_page,
struct block_device *bdev, struct cache *ca)
{
char name[BDEVNAME_SIZE];
const char *err = NULL; /* must be set for any error case */
int ret = 0;
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
bdevname(bdev, name);
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
ca->bdev = bdev;
ca->bdev->bd_holder = ca;
bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
get_page(sb_page);
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
if (blk_queue_discard(bdev_get_queue(bdev)))
ca->discard = CACHE_DISCARD(&ca->sb);
ret = cache_alloc(ca);
if (ret != 0) {
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
else
err = "cache_alloc(): unknown error";
goto err;
}
if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) {
err = "error calling kobject_add";
ret = -ENOMEM;
goto out;
}
mutex_lock(&bch_register_lock);
err = register_cache_set(ca);
mutex_unlock(&bch_register_lock);
if (err) {
ret = -ENODEV;
goto out;
}
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
pr_info("registered cache device %s", name);
out:
kobject_put(&ca->kobj);
err:
if (err)
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
pr_notice("error %s: %s", name, err);
return ret;
}
/* Global interfaces/init */
static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
const char *, size_t);
kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);
static bool bch_is_open_backing(struct block_device *bdev) {
struct cache_set *c, *tc;
struct cached_dev *dc, *t;
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
list_for_each_entry_safe(dc, t, &c->cached_devs, list)
if (dc->bdev == bdev)
return true;
list_for_each_entry_safe(dc, t, &uncached_devices, list)
if (dc->bdev == bdev)
return true;
return false;
}
static bool bch_is_open_cache(struct block_device *bdev) {
struct cache_set *c, *tc;
struct cache *ca;
unsigned i;
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
for_each_cache(ca, c, i)
if (ca->bdev == bdev)
return true;
return false;
}
static bool bch_is_open(struct block_device *bdev) {
return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
ssize_t ret = size;
const char *err = "cannot allocate memory";
char *path = NULL;
struct cache_sb *sb = NULL;
struct block_device *bdev = NULL;
struct page *sb_page = NULL;
if (!try_module_get(THIS_MODULE))
return -EBUSY;
if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
!(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
goto err;
err = "failed to open device";
bdev = blkdev_get_by_path(strim(path),
FMODE_READ|FMODE_WRITE|FMODE_EXCL,
sb);
if (IS_ERR(bdev)) {
if (bdev == ERR_PTR(-EBUSY)) {
bdev = lookup_bdev(strim(path));
mutex_lock(&bch_register_lock);
if (!IS_ERR(bdev) && bch_is_open(bdev))
err = "device already registered";
else
err = "device busy";
mutex_unlock(&bch_register_lock);
if (!IS_ERR(bdev))
bdput(bdev);
if (attr == &ksysfs_register_quiet)
goto out;
}
goto err;
}
err = "failed to set blocksize";
if (set_blocksize(bdev, 4096))
goto err_close;
err = read_super(sb, bdev, &sb_page);
if (err)
goto err_close;
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
err = "failed to register device";
if (SB_IS_BDEV(sb)) {
struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
if (!dc)
goto err_close;
mutex_lock(&bch_register_lock);
register_bdev(sb, sb_page, bdev, dc);
mutex_unlock(&bch_register_lock);
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto err_close;
if (register_cache(sb, sb_page, bdev, ca) != 0)
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
goto err;
}
out:
if (sb_page)
put_page(sb_page);
kfree(sb);
kfree(path);
module_put(THIS_MODULE);
return ret;
err_close:
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
bcache: fix crashes in duplicate cache device register Kernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE 4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:ffffa3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: 0000000000000000 RBX: ffff8c8f2f2f8000 RCX: ffffd6701f8 c7edf [ 417.643890] RDX: ffffa3aa9138fd88 RSI: ffffa3aa9138fd88 RDI: 00000000000 00000 [ 417.643895] RBP: ffffa3aa9138fde0 R08: ffffa3aa9138fae8 R09: 00000000000 1850e [ 417.643901] R10: ffff8c8eed34b271 R11: ffff8c8eed34b250 R12: 00000000000 00000 [ 417.643906] R13: ffffd6701f78f940 R14: ffff8c8f38f80000 R15: ffff8c8ea7d 90000 [ 417.643913] FS: 00007fde7e66f500(0000) GS:ffff8c8f61440000(0000) knlGS: 0000000000000000 [ 417.643919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.643925] CR2: 0000000000000314 CR3: 00000007e6fa0001 CR4: 00000000003 606e0 [ 417.643931] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 00000000000 00000 [ 417.643938] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 00000000000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? __audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:00007fff13009a38 EFLAGS: 00000246 ORIG_RAX: 0000000 000000001 [ 417.644183] RAX: ffffffffffffffda RBX: 0000000001658280 RCX: 00007fde7e1c 1974 [ 417.644188] RDX: 000000000000000a RSI: 0000000001658280 RDI: 000000000000 0001 [ 417.644193] RBP: 000000000000000a R08: 0000000000000003 R09: 000000000000 0077 [ 417.644198] R10: 000000000000089e R11: 0000000000000246 R12: 000000000000 0001 [ 417.644203] R13: 000000000000000a R14: 7fffffffffffffff R15: 000000000000 0000 [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: ffffa3aa9138fd38 [ 417.644306] CR2: 0000000000000314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn> Reported-by: Marc MERLIN <marc@merlins.org> Tested-by: Michael Lyle <mlyle@lyle.org> Reviewed-by: Michael Lyle <mlyle@lyle.org> Cc: <stable@vger.kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-05 21:41:54 +00:00
pr_info("error %s: %s", path, err);
ret = -EINVAL;
goto out;
}
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
if (code == SYS_DOWN ||
code == SYS_HALT ||
code == SYS_POWER_OFF) {
DEFINE_WAIT(wait);
unsigned long start = jiffies;
bool stopped = false;
struct cache_set *c, *tc;
struct cached_dev *dc, *tdc;
mutex_lock(&bch_register_lock);
if (list_empty(&bch_cache_sets) &&
list_empty(&uncached_devices))
goto out;
pr_info("Stopping all devices:");
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
/* What's a condition variable? */
while (1) {
long timeout = start + 2 * HZ - jiffies;
stopped = list_empty(&bch_cache_sets) &&
list_empty(&uncached_devices);
if (timeout < 0 || stopped)
break;
prepare_to_wait(&unregister_wait, &wait,
TASK_UNINTERRUPTIBLE);
mutex_unlock(&bch_register_lock);
schedule_timeout(timeout);
mutex_lock(&bch_register_lock);
}
finish_wait(&unregister_wait, &wait);
if (stopped)
pr_info("All devices stopped");
else
pr_notice("Timeout waiting for devices to be closed");
out:
mutex_unlock(&bch_register_lock);
}
return NOTIFY_DONE;
}
static struct notifier_block reboot = {
.notifier_call = bcache_reboot,
.priority = INT_MAX, /* before any real devices */
};
static void bcache_exit(void)
{
bch_debug_exit();
bch_request_exit();
if (bcache_kobj)
kobject_put(bcache_kobj);
if (bcache_wq)
destroy_workqueue(bcache_wq);
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
unregister_reboot_notifier(&reboot);
mutex_destroy(&bch_register_lock);
}
static int __init bcache_init(void)
{
static const struct attribute *files[] = {
&ksysfs_register.attr,
&ksysfs_register_quiet.attr,
NULL
};
mutex_init(&bch_register_lock);
init_waitqueue_head(&unregister_wait);
register_reboot_notifier(&reboot);
closure_debug_init();
bcache_major = register_blkdev(0, "bcache");
if (bcache_major < 0) {
unregister_reboot_notifier(&reboot);
mutex_destroy(&bch_register_lock);
return bcache_major;
}
if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
!(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
bch_request_init() ||
bch_debug_init(bcache_kobj) ||
sysfs_create_files(bcache_kobj, files))
goto err;
return 0;
err:
bcache_exit();
return -ENOMEM;
}
module_exit(bcache_exit);
module_init(bcache_init);