bcachefs: Add a debug mode that always reads from every btree replica

There's a new module parameter, verify_all_btree_replicas, that enables
reading from every btree replica when reading in btree nodes and
comparing them against each other. We've been seeing some strange btree
corruption - this will hopefully aid in tracking it down and catching it
more often.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
This commit is contained in:
Author: Kent Overstreet, 2021-05-21 23:57:37 -04:00; committed by Kent Overstreet
parent 596d3bdc1e
commit 1ce0cf5fe9
3 changed files with 274 additions and 8 deletions

View File

@ -264,7 +264,10 @@ do { \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
"done in memory")
"done in memory") \
BCH_DEBUG_PARAM(verify_all_btree_replicas, \
"When reading btree nodes, read all replicas and " \
"compare them")
/* Parameters that should only be compiled in in debug mode: */
#define BCH_DEBUG_PARAMS_DEBUG() \

View File

@ -521,7 +521,7 @@ enum btree_validate_ret {
\
switch (write) { \
case READ: \
bch_err(c, "%s", _buf2); \
bch_err(c, "%s", _buf2); \
\
switch (type) { \
case BTREE_ERR_FIXABLE: \
@ -1036,8 +1036,8 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
struct btree *b = rb->b;
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
char buf[200];
@ -1112,6 +1112,262 @@ static void btree_node_read_endio(struct bio *bio)
queue_work(system_unbound_wq, &rb->work);
}
struct btree_node_read_all {
struct closure cl;
struct bch_fs *c;
struct btree *b;
unsigned nr;
void *buf[BCH_REPLICAS_MAX];
struct bio *bio[BCH_REPLICAS_MAX];
int err[BCH_REPLICAS_MAX];
};
/*
 * Walk the bsets in @data and return how many sectors of the node were
 * actually written: the header bset plus every following btree_node_entry
 * whose seq matches the header's. Returns 0 if @data doesn't carry this
 * filesystem's btree node magic.
 */
static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
{
	struct btree_node *bn = data;
	unsigned offset = 0;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return 0;

	while (offset < c->opts.btree_node_size) {
		struct btree_node_entry *bne;

		if (!offset) {
			/* First bset lives in the node header itself: */
			offset += vstruct_sectors(bn, c->block_bits);
			continue;
		}

		bne = data + (offset << 9);
		if (bne->keys.seq != bn->keys.seq)
			break;
		offset += vstruct_sectors(bne, c->block_bits);
	}

	return offset;
}
/*
 * Scan sector-by-sector past @offset (the end of the valid bsets) looking for
 * a stray bset signature — a btree_node_entry whose seq matches the node
 * header's. Used to flag replicas with data after the last valid bset.
 *
 * Fix: dropped the unreachable `return offset;` that followed `return false;`
 * — dead code, and it returned an unsigned from a bool function.
 */
static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;

	/* offset == 0 means the node wasn't valid at all; nothing to scan: */
	if (!offset)
		return false;

	while (offset < c->opts.btree_node_size) {
		bne = data + (offset << 9);
		if (bne->keys.seq == bn->keys.seq)
			return true;
		offset++;
	}

	return false;
}
/*
 * Runs once every replica's read has completed (parent closure drained):
 * cross-checks the copies against each other, picks the best one into
 * b->data, then finishes the node read and frees the per-replica state.
 */
static void btree_node_read_all_replicas_done(struct closure *cl)
{
struct btree_node_read_all *ra =
container_of(cl, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
bool have_good_copy = false;
bool dump_bset_maps = false;
/* NOTE(review): have_retry/ret/write look like they're consumed by the
 * btree_err_on() macro below — confirm against its definition */
bool have_retry = false;
int ret = 0, write = READ;
unsigned i, written, written2;
/* v2 btree pointers carry the seq the on-disk node must have: */
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
for (i = 0; i < ra->nr; i++) {
/* Skip replicas whose read failed: */
if (ra->err[i])
continue;
/* First successfully read replica becomes the working copy: */
if (!have_good_copy) {
memcpy(b->data, ra->buf[i], btree_bytes(c));
have_good_copy = true;
written = btree_node_sectors_written(c, b->data);
}
/* Try to get the right btree node: */
if (have_good_copy &&
seq &&
b->data->keys.seq != seq &&
((struct btree_node *) ra->buf[i])->keys.seq == seq) {
memcpy(b->data, ra->buf[i], btree_bytes(c));
written = btree_node_sectors_written(c, b->data);
}
written2 = btree_node_sectors_written(c, ra->buf[i]);
/* Any mismatch between this replica and the working copy triggers a
 * dump of every replica's bset map below: */
if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
"btree node sectors written mismatch: %u != %u",
written, written2) ||
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
BTREE_ERR_FIXABLE, c, NULL, b, NULL,
"found bset signature after last bset") ||
btree_err_on(memcmp(b->data, ra->buf[i], written << 9),
BTREE_ERR_FIXABLE, c, NULL, b, NULL,
"btree node replicas content mismatch"))
dump_bset_maps = true;
/* Prefer whichever replica has the most data written: */
if (written2 > written) {
written = written2;
memcpy(b->data, ra->buf[i], btree_bytes(c));
}
}
fsck_err:
if (dump_bset_maps) {
for (i = 0; i < ra->nr; i++) {
char buf[200];
struct printbuf out = PBUF(buf);
struct btree_node *bn = ra->buf[i];
struct btree_node_entry *bne = NULL;
unsigned offset = 0, sectors;
bool gap = false;
if (ra->err[i])
continue;
/* Print the contiguous run of valid bsets from the start: */
while (offset < c->opts.btree_node_size) {
if (!offset) {
sectors = vstruct_sectors(bn, c->block_bits);
} else {
bne = ra->buf[i] + (offset << 9);
if (bne->keys.seq != bn->keys.seq)
break;
sectors = vstruct_sectors(bne, c->block_bits);
}
pr_buf(&out, " %u-%u", offset, offset + sectors);
/* '*' marks a bset whose journal seq is blacklisted: */
if (bne && bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq), false))
pr_buf(&out, "*");
offset += sectors;
}
/* Scan sector-by-sector past the run for stray bsets after a gap: */
while (offset < c->opts.btree_node_size) {
bne = ra->buf[i] + (offset << 9);
if (bne->keys.seq == bn->keys.seq) {
if (!gap)
pr_buf(&out, " GAP");
gap = true;
sectors = vstruct_sectors(bne, c->block_bits);
pr_buf(&out, " %u-%u", offset, offset + sectors);
if (bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq), false))
pr_buf(&out, "*");
}
offset++;
}
bch_err(c, "replica %u:%s", i, buf);
}
}
if (have_good_copy)
bch2_btree_node_read_done(c, NULL, b, false);
else
set_btree_node_read_error(b);
/* Free per-replica bounce buffers and bios: */
for (i = 0; i < ra->nr; i++) {
mempool_free(ra->buf[i], &c->btree_bounce_pool);
bio_put(ra->bio[i]);
}
closure_debug_destroy(&ra->cl);
kfree(ra);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
/*
 * Per-replica bio completion for the read-all-replicas path: account read
 * latency if we held an ioref on the device, record this replica's status,
 * and drop our ref on the parent closure (which fires
 * btree_node_read_all_replicas_done() once all replicas complete).
 */
static void btree_node_read_all_replicas_endio(struct bio *bio)
{
	struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio);
	struct btree_node_read_all *ra = rb->ra;

	if (rb->have_ioref)
		bch2_latency_acct(bch_dev_bkey_exists(rb->c, rb->pick.ptr.dev),
				  rb->start_time, READ);

	ra->err[rb->idx] = bio->bi_status;
	closure_put(&ra->cl);
}
/*
 * Debug-mode read path: read the btree node from every replica into its own
 * bounce buffer, then compare the copies against each other in
 * btree_node_read_all_replicas_done(). Returns 0 on success, -ENOMEM if the
 * tracking struct couldn't be allocated.
 *
 * XXX This allocates multiple times from the same mempools, and can deadlock
 * under sufficient memory pressure (but is only a debug path)
 */
static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
{
struct bkey_s_c k = bkey_i_to_s_c(&b->key);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded pick;
struct btree_node_read_all *ra;
unsigned i;
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
return -ENOMEM;
closure_init(&ra->cl, NULL);
ra->c = c;
ra->b = b;
/* NOTE(review): assumes nr <= BCH_REPLICAS_MAX, which sizes the
 * buf[]/bio[]/err[] arrays — confirm callers guarantee this */
ra->nr = bch2_bkey_nr_ptrs(k);
/* One bounce buffer and one bio per replica: */
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
ra->bio[i] = bio_alloc_bioset(NULL,
buf_pages(ra->buf[i], btree_bytes(c)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
}
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
struct btree_read_bio *rb =
container_of(ra->bio[i], struct btree_read_bio, bio);
rb->c = c;
rb->b = b;
rb->ra = ra;
rb->start_time = local_clock();
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->idx = i;
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(&rb->bio));
bio_set_dev(&rb->bio, ca->disk_sb.bdev);
/* ref dropped by btree_node_read_all_replicas_endio(): */
closure_get(&ra->cl);
submit_bio(&rb->bio);
} else {
/* Couldn't get an ioref: record the error, don't submit */
ra->err[i] = BLK_STS_REMOVED;
}
i++;
}
if (sync) {
closure_sync(&ra->cl);
btree_node_read_all_replicas_done(&ra->cl);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done, system_unbound_wq);
}
return 0;
}
void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
bool sync)
{
@ -1125,6 +1381,12 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
btree_pos_to_text(&PBUF(buf), c, b);
trace_btree_read(c, b);
set_btree_node_read_in_flight(b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
return;
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick);
if (bch2_fs_fatal_err_on(ret <= 0, c,
@ -1143,17 +1405,16 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
&c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->b = b;
rb->ra = NULL;
rb->start_time = local_clock();
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
bch2_bio_map(bio, b->data, btree_bytes(c));
set_btree_node_read_in_flight(b);
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(bio));
@ -1162,7 +1423,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
if (sync) {
submit_bio_wait(bio);
bio->bi_private = b;
btree_node_read_work(&rb->work);
} else {
submit_bio(bio);
@ -1174,7 +1434,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
btree_node_read_work(&rb->work);
else
queue_work(system_unbound_wq, &rb->work);
}
}

View File

@ -13,6 +13,7 @@ struct bch_fs;
struct btree_write;
struct btree;
struct btree_iter;
struct btree_node_read_all;
static inline bool btree_node_dirty(struct btree *b)
{
@ -33,8 +34,11 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
struct btree_read_bio {
struct bch_fs *c;
struct btree *b;
struct btree_node_read_all *ra;
u64 start_time;
unsigned have_ioref:1;
unsigned idx:7;
struct extent_ptr_decoded pick;
struct work_struct work;
struct bio bio;