bcachefs: Mark overwrites from journal replay in initial gc

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author: Kent Overstreet, 2019-04-15 14:58:00 -04:00
Committed by: Kent Overstreet
Parent: d07343561e
Commit: c6dd04f8f5
6 changed files with 191 additions and 106 deletions


@@ -273,11 +273,40 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
(int) btree_id_to_gc_phase(r);
}
static int mark_journal_key(struct bch_fs *c, enum btree_id id,
struct bkey_i *insert)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u8 max_stale;
int ret = 0;
ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
if (ret)
return ret;
bch2_trans_init(&trans, c);
for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
BTREE_ITER_SLOTS, k) {
percpu_down_read(&c->mark_lock);
ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
BCH_BUCKET_MARK_GC|
BCH_BUCKET_MARK_NOATOMIC);
percpu_up_read(&c->mark_lock);
if (!ret)
break;
}
return bch2_trans_exit(&trans);
}
static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
enum btree_id ids[BTREE_ID_NR];
u8 max_stale;
unsigned i;
for (i = 0; i < BTREE_ID_NR; i++)
@@ -299,9 +328,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
for_each_journal_key(*journal_keys, j)
if (j->btree_id == id) {
ret = bch2_gc_mark_key(c,
bkey_i_to_s_c(j->k),
&max_stale, initial);
ret = mark_journal_key(c, id, j->k);
if (ret)
return ret;
}


@@ -43,6 +43,7 @@ enum {
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_NOMARK_OVERWRITES,
__BTREE_INSERT_NOMARK,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
@@ -76,6 +77,9 @@ enum {
#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
/* Don't mark overwrites, just new key: */
#define BTREE_INSERT_NOMARK_OVERWRITES (1 << __BTREE_INSERT_NOMARK_OVERWRITES)
/* Don't call bch2_mark_key: */
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
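For orientation, here is a minimal user-space model of how the two flags are consumed after this patch, following the do_btree_insert_at()/bch2_mark_update() hunks below. This is not bcachefs code: the flag values are stand-ins and the two checks (which live in separate functions in the patch) are collapsed into one function for illustration. The intended semantics: NOMARK suppresses marking for the whole transaction, while NOMARK_OVERWRITES still marks the newly inserted key but skips the keys it overwrites.

#include <stdio.h>

#define BTREE_INSERT_NOMARK_OVERWRITES	(1 << 0)	/* stand-in values, not the real bit positions */
#define BTREE_INSERT_NOMARK		(1 << 1)

/* collapses do_btree_insert_at()'s NOMARK check and bch2_mark_update()'s
 * NOMARK_OVERWRITES check into one function, purely for illustration */
static void mark_update(unsigned flags)
{
	if (flags & BTREE_INSERT_NOMARK)
		return;					/* don't call bch2_mark_key at all */

	printf("mark inserted key\n");			/* always done unless NOMARK */

	if (flags & BTREE_INSERT_NOMARK_OVERWRITES)
		return;					/* overwrites were already marked (journal replay) */

	printf("mark overwritten keys\n");
}

int main(void)
{
	mark_update(0);					/* normal insert: key + overwrites */
	mark_update(BTREE_INSERT_NOMARK_OVERWRITES);	/* replay of a split extent: key only */
	mark_update(BTREE_INSERT_NOMARK);		/* plain journal replay: nothing marked here */
	return 0;
}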


@@ -542,20 +542,22 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
btree_trans_lock_write(c, trans);
trans_for_each_update_iter(trans, i) {
if (i->deferred ||
!btree_node_type_needs_gc(i->iter->btree_id))
continue;
if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) {
trans_for_each_update_iter(trans, i) {
if (i->deferred ||
!btree_node_type_needs_gc(i->iter->btree_id))
continue;
if (!fs_usage) {
percpu_down_read(&c->mark_lock);
fs_usage = bch2_fs_usage_scratch_get(c);
}
if (!fs_usage) {
percpu_down_read(&c->mark_lock);
fs_usage = bch2_fs_usage_scratch_get(c);
}
if (!bch2_bkey_replicas_marked_locked(c,
bkey_i_to_s_c(i->k), true)) {
ret = BTREE_INSERT_NEED_MARK_REPLICAS;
goto out;
if (!bch2_bkey_replicas_marked_locked(c,
bkey_i_to_s_c(i->k), true)) {
ret = BTREE_INSERT_NEED_MARK_REPLICAS;
goto out;
}
}
}
@@ -602,16 +604,18 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
linked->flags |= BTREE_ITER_NOUNLOCK;
}
trans_for_each_update_iter(trans, i)
bch2_mark_update(trans, i, fs_usage, 0);
if (fs_usage)
bch2_trans_fs_usage_apply(trans, fs_usage);
if (unlikely(c->gc_pos.phase)) {
if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) {
trans_for_each_update_iter(trans, i)
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
bch2_mark_update(trans, i, NULL,
BCH_BUCKET_MARK_GC);
bch2_mark_update(trans, i, fs_usage, 0);
if (fs_usage)
bch2_trans_fs_usage_apply(trans, fs_usage);
if (unlikely(c->gc_pos.phase)) {
trans_for_each_update_iter(trans, i)
if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
bch2_mark_update(trans, i, NULL,
BCH_BUCKET_MARK_GC);
}
}
trans_for_each_update(trans, i)


@@ -1035,6 +1035,56 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
return ret;
}
inline bool bch2_mark_overwrite(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c old,
struct bkey_i *new,
struct bch_fs_usage *fs_usage,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree *b = iter->l[0].b;
s64 sectors = 0;
if (btree_node_is_extents(b)
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
: bkey_cmp(new->k.p, old.k->p))
return false;
if (btree_node_is_extents(b)) {
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
sectors = -((s64) old.k->size);
break;
case BCH_EXTENT_OVERLAP_BACK:
sectors = bkey_start_offset(&new->k) -
old.k->p.offset;
break;
case BCH_EXTENT_OVERLAP_FRONT:
sectors = bkey_start_offset(old.k) -
new->k.p.offset;
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
sectors = old.k->p.offset - new->k.p.offset;
BUG_ON(sectors <= 0);
bch2_mark_key_locked(c, old, true, sectors,
fs_usage, trans->journal_res.seq,
flags);
sectors = bkey_start_offset(&new->k) -
old.k->p.offset;
break;
}
BUG_ON(sectors >= 0);
}
bch2_mark_key_locked(c, old, false, sectors,
fs_usage, trans->journal_res.seq, flags);
return true;
}
void bch2_mark_update(struct btree_trans *trans,
struct btree_insert_entry *insert,
struct bch_fs_usage *fs_usage,
@@ -1049,57 +1099,23 @@ void bch2_mark_update(struct btree_trans *trans,
if (!btree_node_type_needs_gc(iter->btree_id))
return;
if (!(trans->flags & BTREE_INSERT_NOMARK))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
bpos_min(insert->k->k.p, b->key.k.p).offset -
bkey_start_offset(&insert->k->k),
fs_usage, trans->journal_res.seq, flags);
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
bpos_min(insert->k->k.p, b->key.k.p).offset -
bkey_start_offset(&insert->k->k),
fs_usage, trans->journal_res.seq, flags);
if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
return;
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
KEY_TYPE_discard))) {
struct bkey unpacked;
struct bkey_s_c k;
s64 sectors = 0;
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
k = bkey_disassemble(b, _k, &unpacked);
if (btree_node_is_extents(b)
? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
: bkey_cmp(insert->k->k.p, k.k->p))
if (!bch2_mark_overwrite(trans, iter, k, insert->k,
fs_usage, flags))
break;
if (btree_node_is_extents(b)) {
switch (bch2_extent_overlap(&insert->k->k, k.k)) {
case BCH_EXTENT_OVERLAP_ALL:
sectors = -((s64) k.k->size);
break;
case BCH_EXTENT_OVERLAP_BACK:
sectors = bkey_start_offset(&insert->k->k) -
k.k->p.offset;
break;
case BCH_EXTENT_OVERLAP_FRONT:
sectors = bkey_start_offset(k.k) -
insert->k->k.p.offset;
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
sectors = k.k->p.offset - insert->k->k.p.offset;
BUG_ON(sectors <= 0);
bch2_mark_key_locked(c, k, true, sectors,
fs_usage, trans->journal_res.seq,
flags);
sectors = bkey_start_offset(&insert->k->k) -
k.k->p.offset;
break;
}
BUG_ON(sectors >= 0);
}
bch2_mark_key_locked(c, k, false, sectors,
fs_usage, trans->journal_res.seq, flags);
bch2_btree_node_iter_advance(&node_iter, b);
}
}
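The BCH_EXTENT_OVERLAP_MIDDLE case in bch2_mark_overwrite() above does its accounting in two steps. A small, self-contained worked example (illustrative only, with made-up offsets; not bcachefs code) checks the arithmetic:

#include <assert.h>

int main(void)
{
	/* existing extent "old" covers sectors [0, 128); the key being
	 * inserted, "new", covers [64, 96), punching a hole in the middle */
	long long old_end = 128;
	long long new_start = 64, new_end = 96;

	/* first bch2_mark_key_locked() call (inserted == true): the back
	 * fragment [96, 128) of old survives the split as a new key */
	long long back_fragment = old_end - new_end;	/* old.k->p.offset - new->k.p.offset */
	assert(back_fragment == 32);

	/* second bch2_mark_key_locked() call (inserted == false): everything
	 * from new's start to old's end is unmarked */
	long long overwritten = new_start - old_end;	/* bkey_start_offset(&new->k) - old.k->p.offset */
	assert(overwritten == -64);

	/* net effect on old: it loses exactly the 32 sectors now owned by new */
	assert(back_fragment + overwritten == -(new_end - new_start));
	return 0;
}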


@@ -254,6 +254,9 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *);
bool bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
struct bkey_s_c, struct bkey_i *,
struct bch_fs_usage *, unsigned);
void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
struct bch_fs_usage *, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);


@@ -203,63 +203,94 @@ static void replay_now_at(struct journal *j, u64 seq)
static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
{
struct btree_trans trans;
struct btree_iter *iter;
struct btree_iter *iter, *split_iter;
/*
* We might cause compressed extents to be
* split, so we need to pass in a
* disk_reservation:
* We might cause compressed extents to be split, so we need to pass in
* a disk_reservation:
*/
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
BKEY_PADDED(k) split;
struct bkey_i *split;
bool split_compressed = false;
unsigned flags = BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY|
BTREE_INSERT_NOMARK;
int ret;
bch2_trans_init(&trans, c);
bch2_trans_preload_iters(&trans);
retry:
bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
do {
ret = bch2_btree_iter_traverse(iter);
if (ret)
break;
goto err;
bkey_copy(&split.k, k);
bch2_cut_front(iter->pos, &split.k);
bch2_extent_trim_atomic(&split.k, iter);
split_iter = bch2_trans_copy_iter(&trans, iter);
ret = PTR_ERR_OR_ZERO(split_iter);
if (ret)
goto err;
ret = bch2_disk_reservation_add(c, &disk_res,
split.k.k.size *
bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
ret = PTR_ERR_OR_ZERO(split);
if (ret)
goto err;
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
ret = bch2_trans_commit(&trans, &disk_res, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY);
} while ((!ret || ret == -EINTR) &&
bkey_cmp(k->k.p, iter->pos));
if (!split_compressed &&
bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
!bch2_extent_is_atomic(k, split_iter)) {
ret = bch2_disk_reservation_add(c, &disk_res,
k->k.size *
bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
flags &= ~BTREE_INSERT_JOURNAL_REPLAY;
flags &= ~BTREE_INSERT_NOMARK;
flags |= BTREE_INSERT_NOMARK_OVERWRITES;
split_compressed = true;
}
bkey_copy(split, k);
bch2_cut_front(split_iter->pos, split);
bch2_extent_trim_atomic(split, split_iter);
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
bch2_btree_iter_set_pos(iter, split->k.p);
} while (bkey_cmp(iter->pos, k->k.p) < 0);
ret = bch2_trans_commit(&trans, &disk_res, NULL, flags);
if (ret)
goto err;
if (split_compressed) {
/*
* This isn't strictly correct - we should only be relying on
* the btree node lock for synchronization with gc when we've
* got a write lock held.
*
* but - there are other correctness issues if btree gc were to
* run before journal replay finishes
*/
BUG_ON(c->gc_pos.phase);
bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
NULL, 0, 0);
}
err:
if (ret == -EINTR)
goto retry;
bch2_disk_reservation_put(c, &disk_res);
/*
* This isn't strictly correct - we should only be relying on the btree
* node lock for synchronization with gc when we've got a write lock
* held.
*
* but - there are other correctness issues if btree gc were to run
* before journal replay finishes
*/
BUG_ON(c->gc_pos.phase);
bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
NULL, 0, 0);
bch2_trans_exit(&trans);
return ret;
return bch2_trans_exit(&trans) ?: ret;
}
static int bch2_journal_replay(struct bch_fs *c,