bcachefs: BTREE_ITER_WITH_JOURNAL

This adds a new btree iterator flag, BTREE_ITER_WITH_JOURNAL, that is
automatically enabled when initializing a btree iterator before journal
replay has completed - it overlays keys from the journal on top of the
btree.

This lets us delete bch2_btree_and_journal_walk() and just use the
normal btree iterator interface instead - which also lets us delete a
significant amount of duplicated code.
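
For example, bch2_alloc_read() in the alloc_background.c hunk below no longer needs its bch2_alloc_read_fn() callback. A condensed sketch of the new shape (the gc_lock handling and the per-key bucket update are elided here):

	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	/*
	 * Before journal replay has finished, __bch2_trans_iter_init() adds
	 * BTREE_ITER_WITH_JOURNAL automatically, so this loop also sees keys
	 * that so far exist only in the journal:
	 */
	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		if (!bkey_is_alloc(k.k))
			continue;
		/* ... unpack the alloc key and update the in-memory bucket ... */
	}
	bch2_trans_iter_exit(&trans, &iter);

	bch2_trans_exit(&trans);

	if (ret)
		bch_err(c, "error reading alloc info: %i", ret);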

Note that BTREE_ITER_WITH_JOURNAL is still unoptimized in this patch -
we're redoing the binary search over keys in the journal every time we
call bch2_btree_iter_peek().
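
For reference, the overlay in bch2_btree_iter_peek() boils down to the snippet below, condensed from the btree_iter.c hunk in this commit; the bch2_journal_key_search() call inside __btree_trans_peek_journal() is the binary search that currently gets redone on every peek:

	k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);

	if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
		k = btree_trans_peek_journal(trans, iter, k);

where

static noinline
struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
					 struct btree_iter *iter,
					 struct bkey_s_c k)
{
	/* binary search over the sorted, deduped journal keys: */
	struct bkey_i *next_journal =
		__btree_trans_peek_journal(trans, iter->path);

	/* the journal key wins if it sorts at or before the btree key: */
	if (next_journal &&
	    bpos_cmp(next_journal->k.p,
		     k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
		iter->k = next_journal->k;
		k = bkey_i_to_s_c(next_journal);
	}

	return k;
}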

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author: Kent Overstreet, 2021-12-25 20:07:00 -05:00 (committed by Kent Overstreet)
Parent: f28620c108
Commit: 5222a4607c
10 changed files with 344 additions and 366 deletions


@ -340,46 +340,46 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
#undef x #undef x
} }
static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca;
struct bucket *g;
struct bkey_alloc_unpacked u;
if (!bkey_is_alloc(k.k))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
g = bucket(ca, k.k->p.offset);
u = bch2_alloc_unpack(k);
*bucket_gen(ca, k.k->p.offset) = u.gen;
g->_mark.gen = u.gen;
g->_mark.data_type = u.data_type;
g->_mark.dirty_sectors = u.dirty_sectors;
g->_mark.cached_sectors = u.cached_sectors;
g->_mark.stripe = u.stripe != 0;
g->stripe = u.stripe;
g->stripe_redundancy = u.stripe_redundancy;
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
return 0;
}
int bch2_alloc_read(struct bch_fs *c) int bch2_alloc_read(struct bch_fs *c)
{ {
struct btree_trans trans; struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_dev *ca;
struct bucket *g;
struct bkey_alloc_unpacked u;
int ret; int ret;
bch2_trans_init(&trans, c, 0, 0); bch2_trans_init(&trans, c, 0, 0);
down_read(&c->gc_lock); down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
if (!bkey_is_alloc(k.k))
continue;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
g = bucket(ca, k.k->p.offset);
u = bch2_alloc_unpack(k);
*bucket_gen(ca, k.k->p.offset) = u.gen;
g->_mark.gen = u.gen;
g->_mark.data_type = u.data_type;
g->_mark.dirty_sectors = u.dirty_sectors;
g->_mark.cached_sectors = u.cached_sectors;
g->_mark.stripe = u.stripe != 0;
g->stripe = u.stripe;
g->stripe_redundancy = u.stripe_redundancy;
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
}
bch2_trans_iter_exit(&trans, &iter);
up_read(&c->gc_lock); up_read(&c->gc_lock);
bch2_trans_exit(&trans); bch2_trans_exit(&trans);
if (ret) { if (ret) {
bch_err(c, "error reading alloc info: %i", ret); bch_err(c, "error reading alloc info: %i", ret);
return ret; return ret;


@ -860,7 +860,6 @@ mempool_t bio_bounce_pages;
u64 reflink_hint; u64 reflink_hint;
reflink_gc_table reflink_gc_table; reflink_gc_table reflink_gc_table;
size_t reflink_gc_nr; size_t reflink_gc_nr;
size_t reflink_gc_idx;
/* VFS IO PATH - fs-io.c */ /* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset; struct bio_set writepage_bioset;


@ -1342,59 +1342,6 @@ static int bch2_gc_start(struct bch_fs *c,
return 0; return 0;
} }
static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct reflink_gc *r;
const __le64 *refcount = bkey_refcount_c(k);
char buf[200];
int ret = 0;
if (!refcount)
return 0;
r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
if (!r)
return -ENOMEM;
if (!r ||
r->offset != k.k->p.offset ||
r->size != k.k->size) {
bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
return -EINVAL;
}
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
"reflink key has wrong refcount:\n"
" %s\n"
" should be %u",
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
r->refcount)) {
struct bkey_i *new;
new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto fsck_err;
}
bkey_reassemble(new, k);
if (!r->refcount) {
new->k.type = KEY_TYPE_deleted;
new->k.size = 0;
} else {
*bkey_refcount(new) = cpu_to_le64(r->refcount);
}
ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
kfree(new);
}
fsck_err:
return ret;
}
static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bool metadata_only) bool metadata_only)
{ {
@ -1411,14 +1358,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bch2_trans_init(&trans, c, 0, 0); bch2_trans_init(&trans, c, 0, 0);
if (initial) {
c->reflink_gc_idx = 0;
ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
bch2_gc_reflink_done_initial_fn);
goto out;
}
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) { BTREE_ITER_PREFETCH, k, ret) {
const __le64 *refcount = bkey_refcount_c(k); const __le64 *refcount = bkey_refcount_c(k);
@ -1426,7 +1365,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
if (!refcount) if (!refcount)
continue; continue;
r = genradix_ptr(&c->reflink_gc_table, idx); r = genradix_ptr(&c->reflink_gc_table, idx++);
if (!r || if (!r ||
r->offset != k.k->p.offset || r->offset != k.k->p.offset ||
r->size != k.k->size) { r->size != k.k->size) {
@ -1456,7 +1395,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
else else
*bkey_refcount(new) = cpu_to_le64(r->refcount); *bkey_refcount(new) = cpu_to_le64(r->refcount);
ret = __bch2_trans_do(&trans, NULL, NULL, 0, ret = initial
? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new)
: __bch2_trans_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_reflink, new)); __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
kfree(new); kfree(new);
@ -1466,64 +1407,21 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
} }
fsck_err: fsck_err:
bch2_trans_iter_exit(&trans, &iter); bch2_trans_iter_exit(&trans, &iter);
out:
c->reflink_gc_nr = 0; c->reflink_gc_nr = 0;
bch2_trans_exit(&trans); bch2_trans_exit(&trans);
return ret; return ret;
} }
static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct gc_stripe *m;
const struct bch_stripe *s;
char buf[200];
unsigned i;
int ret = 0;
if (k.k->type != KEY_TYPE_stripe)
return 0;
s = bkey_s_c_to_stripe(k).v;
m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
for (i = 0; i < s->nr_blocks; i++)
if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
goto inconsistent;
return 0;
inconsistent:
if (fsck_err_on(true, c,
"stripe has wrong block sector count %u:\n"
" %s\n"
" should be %u", i,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
m ? m->block_sectors[i] : 0)) {
struct bkey_i_stripe *new;
new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto fsck_err;
}
bkey_reassemble(&new->k_i, k);
for (i = 0; i < new->v.nr_blocks; i++)
stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i);
kfree(new);
}
fsck_err:
return ret;
}
static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
bool metadata_only) bool metadata_only)
{ {
struct btree_trans trans; struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
struct gc_stripe *m;
const struct bch_stripe *s;
char buf[200];
unsigned i;
int ret = 0; int ret = 0;
if (metadata_only) if (metadata_only)
@ -1531,39 +1429,52 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
bch2_trans_init(&trans, c, 0, 0); bch2_trans_init(&trans, c, 0, 0);
if (initial) { for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, BTREE_ITER_PREFETCH, k, ret) {
bch2_gc_stripes_done_initial_fn); if (k.k->type != KEY_TYPE_stripe)
} else { continue;
BUG();
s = bkey_s_c_to_stripe(k).v;
m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
for (i = 0; i < s->nr_blocks; i++)
if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
goto inconsistent;
continue;
inconsistent:
if (fsck_err_on(true, c,
"stripe has wrong block sector count %u:\n"
" %s\n"
" should be %u", i,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
m ? m->block_sectors[i] : 0)) {
struct bkey_i_stripe *new;
new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
break;
}
bkey_reassemble(&new->k_i, k);
for (i = 0; i < new->v.nr_blocks; i++)
stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
ret = initial
? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
: __bch2_trans_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i));
kfree(new);
}
} }
fsck_err:
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans); bch2_trans_exit(&trans);
return ret; return ret;
} }
static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct reflink_gc *r;
const __le64 *refcount = bkey_refcount_c(k);
if (!refcount)
return 0;
r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
GFP_KERNEL);
if (!r)
return -ENOMEM;
r->offset = k.k->p.offset;
r->size = k.k->size;
r->refcount = 0;
return 0;
}
static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
bool metadata_only) bool metadata_only)
{ {
@ -1579,12 +1490,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
bch2_trans_init(&trans, c, 0, 0); bch2_trans_init(&trans, c, 0, 0);
c->reflink_gc_nr = 0; c->reflink_gc_nr = 0;
if (initial) {
ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
bch2_gc_reflink_start_initial_fn);
goto out;
}
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) { BTREE_ITER_PREFETCH, k, ret) {
const __le64 *refcount = bkey_refcount_c(k); const __le64 *refcount = bkey_refcount_c(k);
@ -1604,7 +1509,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
r->refcount = 0; r->refcount = 0;
} }
bch2_trans_iter_exit(&trans, &iter); bch2_trans_iter_exit(&trans, &iter);
out:
bch2_trans_exit(&trans); bch2_trans_exit(&trans);
return ret; return ret;
} }


@ -12,6 +12,7 @@
#include "error.h" #include "error.h"
#include "extents.h" #include "extents.h"
#include "journal.h" #include "journal.h"
#include "recovery.h"
#include "replicas.h" #include "replicas.h"
#include "subvolume.h" #include "subvolume.h"
#include "trace.h" #include "trace.h"
@ -1064,6 +1065,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
static void btree_path_verify_new_node(struct btree_trans *trans, static void btree_path_verify_new_node(struct btree_trans *trans,
struct btree_path *path, struct btree *b) struct btree_path *path, struct btree *b)
{ {
struct bch_fs *c = trans->c;
struct btree_path_level *l; struct btree_path_level *l;
unsigned plevel; unsigned plevel;
bool parent_locked; bool parent_locked;
@ -1072,6 +1074,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
return; return;
if (trans->journal_replay_not_finished)
return;
plevel = b->c.level + 1; plevel = b->c.level + 1;
if (!btree_path_node(path, plevel)) if (!btree_path_node(path, plevel))
return; return;
@ -1092,7 +1097,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
char buf4[100]; char buf4[100];
struct bkey uk = bkey_unpack_key(b, k); struct bkey uk = bkey_unpack_key(b, k);
bch2_dump_btree_node(trans->c, l->b); bch2_dump_btree_node(c, l->b);
bch2_bpos_to_text(&PBUF(buf1), path->pos); bch2_bpos_to_text(&PBUF(buf1), path->pos);
bch2_bkey_to_text(&PBUF(buf2), &uk); bch2_bkey_to_text(&PBUF(buf2), &uk);
bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
@ -1283,6 +1288,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
return ret; return ret;
} }
static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
struct btree_and_journal_iter *jiter)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bkey_buf tmp;
unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
? (path->level > 1 ? 0 : 2)
: (path->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(path, path->level);
int ret = 0;
bch2_bkey_buf_init(&tmp);
while (nr && !ret) {
if (!bch2_btree_node_relock(trans, path, path->level))
break;
bch2_btree_and_journal_iter_advance(jiter);
k = bch2_btree_and_journal_iter_peek(jiter);
if (!k.k)
break;
bch2_bkey_buf_reassemble(&tmp, c, k);
ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
path->level - 1);
}
if (!was_locked)
btree_node_unlock(path, path->level);
bch2_bkey_buf_exit(&tmp, c);
return ret;
}
static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
struct btree_path *path, struct btree_path *path,
unsigned plevel, struct btree *b) unsigned plevel, struct btree *b)
@ -1305,6 +1345,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
btree_node_unlock(path, plevel); btree_node_unlock(path, plevel);
} }
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct btree_path *path,
unsigned flags,
struct bkey_buf *out)
{
struct bch_fs *c = trans->c;
struct btree_path_level *l = path_l(path);
struct btree_and_journal_iter jiter;
struct bkey_s_c k;
int ret = 0;
__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
bch2_bkey_buf_reassemble(out, c, k);
if (flags & BTREE_ITER_PREFETCH)
ret = btree_path_prefetch_j(trans, path, &jiter);
bch2_btree_and_journal_iter_exit(&jiter);
return ret;
}
static __always_inline int btree_path_down(struct btree_trans *trans, static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree_path *path, struct btree_path *path,
unsigned flags, unsigned flags,
@ -1321,8 +1385,21 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
EBUG_ON(!btree_node_locked(path, path->level)); EBUG_ON(!btree_node_locked(path, path->level));
bch2_bkey_buf_init(&tmp); bch2_bkey_buf_init(&tmp);
bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b)); if (unlikely(trans->journal_replay_not_finished)) {
ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
if (ret)
goto err;
} else {
bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b));
if (flags & BTREE_ITER_PREFETCH) {
ret = btree_path_prefetch(trans, path);
if (ret)
goto err;
}
}
b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b); ret = PTR_ERR_OR_ZERO(b);
@ -1332,13 +1409,11 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
mark_btree_node_locked(path, level, lock_type); mark_btree_node_locked(path, level, lock_type);
btree_path_level_init(trans, path, b); btree_path_level_init(trans, path, b);
if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && if (likely(!trans->journal_replay_not_finished &&
tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
unlikely(b != btree_node_mem_ptr(tmp.k))) unlikely(b != btree_node_mem_ptr(tmp.k)))
btree_node_mem_ptr_set(trans, path, level + 1, b); btree_node_mem_ptr_set(trans, path, level + 1, b);
if (flags & BTREE_ITER_PREFETCH)
ret = btree_path_prefetch(trans, path);
if (btree_node_read_locked(path, level + 1)) if (btree_node_read_locked(path, level + 1))
btree_node_unlock(path, level + 1); btree_node_unlock(path, level + 1);
path->level = level; path->level = level;
@ -2113,6 +2188,55 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
return ret; return ret;
} }
static struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
struct btree_path *path)
{
struct journal_keys *keys = &trans->c->journal_keys;
size_t idx = bch2_journal_key_search(keys, path->btree_id,
path->level, path->pos);
while (idx < keys->nr && keys->d[idx].overwritten)
idx++;
return (idx < keys->nr &&
keys->d[idx].btree_id == path->btree_id &&
keys->d[idx].level == path->level)
? keys->d[idx].k
: NULL;
}
static noinline
struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
struct bkey_i *k = __btree_trans_peek_journal(trans, iter->path);
if (k && !bpos_cmp(k->k.p, iter->pos)) {
iter->k = k->k;
return bkey_i_to_s_c(k);
} else {
return bkey_s_c_null;
}
}
static noinline
struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bkey_i *next_journal =
__btree_trans_peek_journal(trans, iter->path);
if (next_journal &&
bpos_cmp(next_journal->k.p,
k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
iter->k = next_journal->k;
k = bkey_i_to_s_c(next_journal);
}
return k;
}
/** /**
* bch2_btree_iter_peek: returns first key greater than or equal to iterator's * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
* current position * current position
@ -2141,16 +2265,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
goto out; goto out;
} }
next_update = btree_trans_peek_updates(iter);
k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
/* * In the btree, deleted keys sort before non deleted: */ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
if (k.k && bkey_deleted(k.k) && k = btree_trans_peek_journal(trans, iter, k);
(!next_update ||
bpos_cmp(k.k->p, next_update->k.p) <= 0)) { next_update = btree_trans_peek_updates(iter);
search_key = k.k->p;
continue;
}
if (next_update && if (next_update &&
bpos_cmp(next_update->k.p, bpos_cmp(next_update->k.p,
@ -2159,6 +2279,20 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
k = bkey_i_to_s_c(next_update); k = bkey_i_to_s_c(next_update);
} }
if (k.k && bkey_deleted(k.k)) {
/*
* If we've got a whiteout, and it's after the search
* key, advance the search key to the whiteout instead
* of just after the whiteout - it might be a btree
* whiteout, with a real key at the same position, since
* in the btree deleted keys sort before non deleted.
*/
search_key = bpos_cmp(search_key, k.k->p)
? k.k->p
: bpos_successor(k.k->p);
continue;
}
if (likely(k.k)) { if (likely(k.k)) {
/* /*
* We can never have a key in a leaf node at POS_MAX, so * We can never have a key in a leaf node at POS_MAX, so
@ -2249,6 +2383,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
EBUG_ON(iter->path->cached || iter->path->level); EBUG_ON(iter->path->cached || iter->path->level);
EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
if (iter->flags & BTREE_ITER_WITH_JOURNAL)
return bkey_s_c_err(-EIO);
bch2_btree_iter_verify(iter); bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify_entry_exit(iter);
@ -2395,23 +2533,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
struct bkey_i *next_update; struct bkey_i *next_update;
next_update = btree_trans_peek_updates(iter); if ((next_update = btree_trans_peek_updates(iter)) &&
if (next_update &&
!bpos_cmp(next_update->k.p, iter->pos)) { !bpos_cmp(next_update->k.p, iter->pos)) {
iter->k = next_update->k; iter->k = next_update->k;
k = bkey_i_to_s_c(next_update); k = bkey_i_to_s_c(next_update);
} else { goto out;
k = bch2_btree_path_peek_slot(iter->path, &iter->k);
} }
if (!k.k || if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
((iter->flags & BTREE_ITER_ALL_SNAPSHOTS) (k = btree_trans_peek_slot_journal(trans, iter)).k)
? bpos_cmp(iter->pos, k.k->p) goto out;
: bkey_cmp(iter->pos, k.k->p))) {
bkey_init(&iter->k); k = bch2_btree_path_peek_slot(iter->path, &iter->k);
iter->k.p = iter->pos;
k = (struct bkey_s_c) { &iter->k, NULL };
}
} else { } else {
struct bpos next; struct bpos next;
@ -2455,7 +2588,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = (struct bkey_s_c) { &iter->k, NULL }; k = (struct bkey_s_c) { &iter->k, NULL };
} }
} }
out:
iter->path->should_be_locked = true; iter->path->should_be_locked = true;
bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify_entry_exit(iter);
@ -2635,6 +2768,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
btree_type_has_snapshots(btree_id)) btree_type_has_snapshots(btree_id))
flags |= BTREE_ITER_FILTER_SNAPSHOTS; flags |= BTREE_ITER_FILTER_SNAPSHOTS;
if (trans->journal_replay_not_finished)
flags |= BTREE_ITER_WITH_JOURNAL;
iter->trans = trans; iter->trans = trans;
iter->path = NULL; iter->path = NULL;
iter->btree_id = btree_id; iter->btree_id = btree_id;
@ -2801,6 +2937,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
memset(trans, 0, sizeof(*trans)); memset(trans, 0, sizeof(*trans));
trans->c = c; trans->c = c;
trans->ip = _RET_IP_; trans->ip = _RET_IP_;
trans->journal_replay_not_finished =
!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
bch2_trans_alloc_paths(trans, c); bch2_trans_alloc_paths(trans, c);


@ -207,10 +207,11 @@ struct btree_node_iter {
#define BTREE_ITER_CACHED_NOFILL (1 << 8) #define BTREE_ITER_CACHED_NOFILL (1 << 8)
#define BTREE_ITER_CACHED_NOCREATE (1 << 9) #define BTREE_ITER_CACHED_NOCREATE (1 << 9)
#define BTREE_ITER_WITH_UPDATES (1 << 10) #define BTREE_ITER_WITH_UPDATES (1 << 10)
#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) #define BTREE_ITER_WITH_JOURNAL (1 << 11)
#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) #define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 13)
#define BTREE_ITER_NOPRESERVE (1 << 14) #define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14)
#define BTREE_ITER_NOPRESERVE (1 << 15)
enum btree_path_uptodate { enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0, BTREE_ITER_UPTODATE = 0,
@ -381,6 +382,7 @@ struct btree_trans {
bool restarted:1; bool restarted:1;
bool paths_sorted:1; bool paths_sorted:1;
bool journal_transaction_names:1; bool journal_transaction_names:1;
bool journal_replay_not_finished:1;
/* /*
* For when bch2_trans_update notices we'll be splitting a compressed * For when bch2_trans_update notices we'll be splitting a compressed
* extent: * extent:


@ -16,6 +16,7 @@
#include "journal.h" #include "journal.h"
#include "journal_reclaim.h" #include "journal_reclaim.h"
#include "keylist.h" #include "keylist.h"
#include "recovery.h"
#include "replicas.h" #include "replicas.h"
#include "super-io.h" #include "super-io.h"
#include "trace.h" #include "trace.h"
@ -1146,6 +1147,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert)); !btree_ptr_sectors_written(insert));
if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
if (invalid) { if (invalid) {


@ -711,7 +711,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans); bch2_drop_overwrites_from_journal(trans);
trans_for_each_update(trans, i) trans_for_each_update(trans, i)


@ -1558,50 +1558,48 @@ void bch2_stripes_heap_start(struct bch_fs *c)
bch2_stripes_heap_insert(c, m, iter.pos); bch2_stripes_heap_insert(c, m, iter.pos);
} }
static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k)
{
const struct bch_stripe *s;
struct bch_fs *c = trans->c;
struct stripe *m;
unsigned i;
int ret = 0;
if (k.k->type != KEY_TYPE_stripe)
return 0;
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
if (ret)
return ret;
s = bkey_s_c_to_stripe(k).v;
m = genradix_ptr(&c->stripes, k.k->p.offset);
m->alive = true;
m->sectors = le16_to_cpu(s->sectors);
m->algorithm = s->algorithm;
m->nr_blocks = s->nr_blocks;
m->nr_redundant = s->nr_redundant;
m->blocks_nonempty = 0;
for (i = 0; i < s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, k.k->p.offset);
spin_unlock(&c->ec_stripes_heap_lock);
return ret;
}
int bch2_stripes_read(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c)
{ {
struct btree_trans trans; struct btree_trans trans;
struct btree_iter iter;
struct bkey_s_c k;
const struct bch_stripe *s;
struct stripe *m;
unsigned i;
int ret; int ret;
bch2_trans_init(&trans, c, 0, 0); bch2_trans_init(&trans, c, 0, 0);
ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
bch2_stripes_read_fn); for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
if (k.k->type != KEY_TYPE_stripe)
continue;
ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
if (ret)
break;
s = bkey_s_c_to_stripe(k).v;
m = genradix_ptr(&c->stripes, k.k->p.offset);
m->alive = true;
m->sectors = le16_to_cpu(s->sectors);
m->algorithm = s->algorithm;
m->nr_blocks = s->nr_blocks;
m->nr_redundant = s->nr_redundant;
m->blocks_nonempty = 0;
for (i = 0; i < s->nr_blocks; i++)
m->blocks_nonempty += !!stripe_blockcount_get(s, i);
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, k.k->p.offset);
spin_unlock(&c->ec_stripes_heap_lock);
}
bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans); bch2_trans_exit(&trans);
if (ret) if (ret)
bch_err(c, "error reading stripes: %i", ret); bch_err(c, "error reading stripes: %i", ret);


@ -59,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
static int __journal_key_cmp(enum btree_id l_btree_id, static int __journal_key_cmp(enum btree_id l_btree_id,
unsigned l_level, unsigned l_level,
struct bpos l_pos, struct bpos l_pos,
struct journal_key *r) const struct journal_key *r)
{ {
return (cmp_int(l_btree_id, r->btree_id) ?: return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?: cmp_int(l_level, r->level) ?:
bpos_cmp(l_pos, r->k->k.p)); bpos_cmp(l_pos, r->k->k.p));
} }
static int journal_key_cmp(struct journal_key *l, struct journal_key *r) static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{ {
return (cmp_int(l->btree_id, r->btree_id) ?: return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p));
} }
static size_t journal_key_search(struct journal_keys *journal_keys, size_t bch2_journal_key_search(struct journal_keys *journal_keys,
enum btree_id id, unsigned level, enum btree_id id, unsigned level,
struct bpos pos) struct bpos pos)
{ {
size_t l = 0, r = journal_keys->nr, m; size_t l = 0, r = journal_keys->nr, m;
@ -125,7 +123,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
}; };
struct journal_keys *keys = &c->journal_keys; struct journal_keys *keys = &c->journal_keys;
struct journal_iter *iter; struct journal_iter *iter;
unsigned idx = journal_key_search(keys, id, level, k->k.p); size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
BUG_ON(test_bit(BCH_FS_RW, &c->flags)); BUG_ON(test_bit(BCH_FS_RW, &c->flags));
@ -164,6 +162,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
return 0; return 0;
} }
/*
* Can only be used from the recovery thread while we're still RO - can't be
* used once we've got RW, as journal_keys is at that point used by multiple
* threads:
*/
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
unsigned level, struct bkey_i *k) unsigned level, struct bkey_i *k)
{ {
@ -196,7 +199,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos) unsigned level, struct bpos pos)
{ {
struct journal_keys *keys = &c->journal_keys; struct journal_keys *keys = &c->journal_keys;
size_t idx = journal_key_search(keys, btree, level, pos); size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->nr && if (idx < keys->nr &&
keys->d[idx].btree_id == btree && keys->d[idx].btree_id == btree &&
@ -207,15 +210,18 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{ {
struct journal_key *k = iter->idx - iter->keys->nr struct journal_key *k = iter->keys->d + iter->idx;
? iter->keys->d + iter->idx : NULL;
if (k && while (k < iter->keys->d + iter->keys->nr &&
k->btree_id == iter->btree_id && k->btree_id == iter->btree_id &&
k->level == iter->level) k->level == iter->level) {
return k->k; if (!k->overwritten)
return k->k;
iter->idx++;
k = iter->keys->d + iter->idx;
}
iter->idx = iter->keys->nr;
return NULL; return NULL;
} }
@ -238,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->btree_id = id; iter->btree_id = id;
iter->level = level; iter->level = level;
iter->keys = &c->journal_keys; iter->keys = &c->journal_keys;
iter->idx = journal_key_search(&c->journal_keys, id, level, pos); iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
list_add(&iter->list, &c->journal_iters);
} }
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@ -325,106 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal); bch2_journal_iter_exit(&iter->journal);
} }
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
struct bch_fs *c, struct bch_fs *c,
struct btree *b) struct btree *b,
struct btree_node_iter node_iter,
struct bpos pos)
{ {
memset(iter, 0, sizeof(*iter)); memset(iter, 0, sizeof(*iter));
iter->b = b; iter->b = b;
bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); iter->node_iter = node_iter;
bch2_journal_iter_init(c, &iter->journal, bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
b->c.btree_id, b->c.level, b->data->min_key); INIT_LIST_HEAD(&iter->journal.list);
} }
/* Walk btree, overlaying keys from the journal: */ /*
* this version is used by btree_gc before filesystem has gone RW and
static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, * multithreaded, so uses the journal_iters list:
struct btree_and_journal_iter iter) */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
struct bch_fs *c,
struct btree *b)
{ {
unsigned i = 0, nr = b->c.level > 1 ? 2 : 16; struct btree_node_iter node_iter;
struct bkey_s_c k;
struct bkey_buf tmp;
BUG_ON(!b->c.level); bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
bch2_bkey_buf_init(&tmp); list_add(&iter->journal.list, &c->journal_iters);
while (i < nr &&
(k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_buf_reassemble(&tmp, c, k);
bch2_btree_node_prefetch(c, NULL, NULL, tmp.k,
b->c.btree_id, b->c.level - 1);
bch2_btree_and_journal_iter_advance(&iter);
i++;
}
bch2_bkey_buf_exit(&tmp, c);
}
static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b,
enum btree_id btree_id,
btree_walk_key_fn key_fn)
{
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf tmp;
struct btree *child;
int ret = 0;
bch2_bkey_buf_init(&tmp);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
if (b->c.level) {
bch2_bkey_buf_reassemble(&tmp, c, k);
child = bch2_btree_node_get_noiter(c, tmp.k,
b->c.btree_id, b->c.level - 1,
false);
ret = PTR_ERR_OR_ZERO(child);
if (ret)
break;
btree_and_journal_iter_prefetch(c, b, iter);
ret = bch2_btree_and_journal_walk_recurse(trans, child,
btree_id, key_fn);
six_unlock_read(&child->c.lock);
} else {
ret = key_fn(trans, k);
}
if (ret)
break;
bch2_btree_and_journal_iter_advance(&iter);
}
bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&tmp, c);
return ret;
}
int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id,
btree_walk_key_fn key_fn)
{
struct bch_fs *c = trans->c;
struct btree *b = c->btree_roots[btree_id].b;
int ret = 0;
if (btree_node_fake(b))
return 0;
six_lock_read(&b->c.lock, NULL, NULL);
ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn);
six_unlock_read(&b->c.lock);
return ret;
} }
/* sort and dedup all keys in the journal: */ /* sort and dedup all keys in the journal: */
@ -449,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
const struct journal_key *l = _l; const struct journal_key *l = _l;
const struct journal_key *r = _r; const struct journal_key *r = _r;
return cmp_int(l->btree_id, r->btree_id) ?: return journal_key_cmp(l, r) ?:
cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset); cmp_int(l->journal_offset, r->journal_offset);
} }


@ -31,6 +31,9 @@ struct btree_and_journal_iter {
} last; } last;
}; };
size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
unsigned, struct bpos);
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *); unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id, int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@ -45,14 +48,13 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *, struct btree *,
struct btree_node_iter, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *, struct bch_fs *,
struct btree *); struct btree *);
typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c);
int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn);
void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_entries_free(struct list_head *); void bch2_journal_entries_free(struct list_head *);