Mirror of https://github.com/torvalds/linux.git (synced 2024-11-22 20:22:09 +00:00)
bcachefs: Mark overwrites from journal replay in initial gc
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent d07343561e
commit c6dd04f8f5
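
The core of this commit is the sector accounting that the new bch2_mark_overwrite() helper (split out of bch2_mark_update() in fs/bcachefs/buckets.c below) applies to each existing key an insert overwrites. Here is a minimal standalone sketch of that arithmetic, using simplified stand-in types rather than the real bkey machinery; the struct, enum, and function names below are illustrative, not bcachefs API, and only the overlap classification and sector sums mirror the diff:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for an extent bkey: [start, end) in sectors. */
struct ext {
	int64_t start, end;	/* end is exclusive, like bkey.p.offset */
};

enum overlap {			/* mirrors BCH_EXTENT_OVERLAP_* */
	OVERLAP_ALL,		/* new covers all of old */
	OVERLAP_BACK,		/* new covers the tail of old */
	OVERLAP_FRONT,		/* new covers the head of old */
	OVERLAP_MIDDLE,		/* new sits strictly inside old */
};

static enum overlap classify(const struct ext *new, const struct ext *old)
{
	if (new->start <= old->start)
		return new->end >= old->end ? OVERLAP_ALL : OVERLAP_FRONT;
	else
		return new->end >= old->end ? OVERLAP_BACK : OVERLAP_MIDDLE;
}

/*
 * Net sector delta charged against the old extent when `new` overwrites
 * it; the arithmetic mirrors the switch in bch2_mark_overwrite() below.
 * (For OVERLAP_MIDDLE the real code issues two marking calls - one for
 * the surviving back fragment, one negative - with the same net sum.)
 */
static int64_t overwritten_sectors(const struct ext *new, const struct ext *old)
{
	switch (classify(new, old)) {
	case OVERLAP_ALL:
		return -(old->end - old->start);	/* whole old key gone */
	case OVERLAP_BACK:
		return new->start - old->end;		/* tail of old shadowed */
	case OVERLAP_FRONT:
		return old->start - new->end;		/* head of old shadowed */
	case OVERLAP_MIDDLE:
		return new->start - new->end;		/* hole punched in old */
	}
	return 0;
}

int main(void)
{
	struct ext old  = { .start = 0,  .end = 100 };
	struct ext mid  = { .start = 20, .end = 30 };
	struct ext back = { .start = 90, .end = 120 };

	/* A 10-sector key inside the old one shadows exactly 10 sectors. */
	assert(classify(&mid, &old) == OVERLAP_MIDDLE);
	assert(overwritten_sectors(&mid, &old) == -10);

	/* A key overlapping the tail shadows the 10 overlapped sectors. */
	assert(classify(&back, &old) == OVERLAP_BACK);
	assert(overwritten_sectors(&back, &old) == -10);

	printf("overlap accounting ok\n");
	return 0;
}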

fs/bcachefs/btree_gc.c

@@ -273,11 +273,40 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 		(int) btree_id_to_gc_phase(r);
 }
 
+static int mark_journal_key(struct bch_fs *c, enum btree_id id,
+			    struct bkey_i *insert)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u8 max_stale;
+	int ret = 0;
+
+	ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true);
+	if (ret)
+		return ret;
+
+	bch2_trans_init(&trans, c);
+
+	for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k),
+			   BTREE_ITER_SLOTS, k) {
+		percpu_down_read(&c->mark_lock);
+		ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL,
+					  BCH_BUCKET_MARK_GC|
+					  BCH_BUCKET_MARK_NOATOMIC);
+		percpu_up_read(&c->mark_lock);
+
+		if (!ret)
+			break;
+	}
+
+	return bch2_trans_exit(&trans);
+}
+
 static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
 			  bool initial, bool metadata_only)
 {
 	enum btree_id ids[BTREE_ID_NR];
 	u8 max_stale;
 	unsigned i;
 
 	for (i = 0; i < BTREE_ID_NR; i++)
@@ -299,9 +328,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
 
 			for_each_journal_key(*journal_keys, j)
 				if (j->btree_id == id) {
-					ret = bch2_gc_mark_key(c,
-							bkey_i_to_s_c(j->k),
-							&max_stale, initial);
+					ret = mark_journal_key(c, id, j->k);
 					if (ret)
 						return ret;
 				}

fs/bcachefs/btree_update.h

@@ -43,6 +43,7 @@ enum {
 	__BTREE_INSERT_USE_ALLOC_RESERVE,
 	__BTREE_INSERT_JOURNAL_REPLAY,
 	__BTREE_INSERT_JOURNAL_RESERVED,
+	__BTREE_INSERT_NOMARK_OVERWRITES,
 	__BTREE_INSERT_NOMARK,
 	__BTREE_INSERT_NOWAIT,
 	__BTREE_INSERT_GC_LOCK_HELD,
@@ -76,6 +77,9 @@ enum {
 
 #define BTREE_INSERT_JOURNAL_RESERVED	(1 << __BTREE_INSERT_JOURNAL_RESERVED)
 
+/* Don't mark overwrites, just new key: */
+#define BTREE_INSERT_NOMARK_OVERWRITES	(1 << __BTREE_INSERT_NOMARK_OVERWRITES)
+
 /* Don't call bch2_mark_key: */
 #define BTREE_INSERT_NOMARK		(1 << __BTREE_INSERT_NOMARK)

fs/bcachefs/btree_update_leaf.c

@@ -542,20 +542,22 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
 
 	btree_trans_lock_write(c, trans);
 
-	trans_for_each_update_iter(trans, i) {
-		if (i->deferred ||
-		    !btree_node_type_needs_gc(i->iter->btree_id))
-			continue;
+	if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) {
+		trans_for_each_update_iter(trans, i) {
+			if (i->deferred ||
+			    !btree_node_type_needs_gc(i->iter->btree_id))
+				continue;
 
-		if (!fs_usage) {
-			percpu_down_read(&c->mark_lock);
-			fs_usage = bch2_fs_usage_scratch_get(c);
-		}
+			if (!fs_usage) {
+				percpu_down_read(&c->mark_lock);
+				fs_usage = bch2_fs_usage_scratch_get(c);
+			}
 
-		if (!bch2_bkey_replicas_marked_locked(c,
-				bkey_i_to_s_c(i->k), true)) {
-			ret = BTREE_INSERT_NEED_MARK_REPLICAS;
-			goto out;
+			if (!bch2_bkey_replicas_marked_locked(c,
+					bkey_i_to_s_c(i->k), true)) {
+				ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+				goto out;
+			}
 		}
 	}
 
@@ -602,16 +604,18 @@ static inline int do_btree_insert_at(struct btree_trans *trans,
 			linked->flags |= BTREE_ITER_NOUNLOCK;
 	}
 
-	trans_for_each_update_iter(trans, i)
-		bch2_mark_update(trans, i, fs_usage, 0);
-	if (fs_usage)
-		bch2_trans_fs_usage_apply(trans, fs_usage);
+	if (likely(!(trans->flags & BTREE_INSERT_NOMARK))) {
+		trans_for_each_update_iter(trans, i)
+			bch2_mark_update(trans, i, fs_usage, 0);
+		if (fs_usage)
+			bch2_trans_fs_usage_apply(trans, fs_usage);
 
-	if (unlikely(c->gc_pos.phase)) {
-		trans_for_each_update_iter(trans, i)
-			if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
-				bch2_mark_update(trans, i, NULL,
-						 BCH_BUCKET_MARK_GC);
+		if (unlikely(c->gc_pos.phase)) {
+			trans_for_each_update_iter(trans, i)
+				if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+					bch2_mark_update(trans, i, NULL,
+							 BCH_BUCKET_MARK_GC);
+		}
 	}
 
 	trans_for_each_update(trans, i)

fs/bcachefs/buckets.c

@@ -1035,6 +1035,56 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
 	return ret;
 }
 
+inline bool bch2_mark_overwrite(struct btree_trans *trans,
+				struct btree_iter *iter,
+				struct bkey_s_c old,
+				struct bkey_i *new,
+				struct bch_fs_usage *fs_usage,
+				unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *b = iter->l[0].b;
+	s64 sectors = 0;
+
+	if (btree_node_is_extents(b)
+	    ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
+	    : bkey_cmp(new->k.p, old.k->p))
+		return false;
+
+	if (btree_node_is_extents(b)) {
+		switch (bch2_extent_overlap(&new->k, old.k)) {
+		case BCH_EXTENT_OVERLAP_ALL:
+			sectors = -((s64) old.k->size);
+			break;
+		case BCH_EXTENT_OVERLAP_BACK:
+			sectors = bkey_start_offset(&new->k) -
+				old.k->p.offset;
+			break;
+		case BCH_EXTENT_OVERLAP_FRONT:
+			sectors = bkey_start_offset(old.k) -
+				new->k.p.offset;
+			break;
+		case BCH_EXTENT_OVERLAP_MIDDLE:
+			sectors = old.k->p.offset - new->k.p.offset;
+			BUG_ON(sectors <= 0);
+
+			bch2_mark_key_locked(c, old, true, sectors,
+				fs_usage, trans->journal_res.seq,
+				flags);
+
+			sectors = bkey_start_offset(&new->k) -
+				old.k->p.offset;
+			break;
+		}
+
+		BUG_ON(sectors >= 0);
+	}
+
+	bch2_mark_key_locked(c, old, false, sectors,
+		fs_usage, trans->journal_res.seq, flags);
+	return true;
+}
+
 void bch2_mark_update(struct btree_trans *trans,
 		      struct btree_insert_entry *insert,
 		      struct bch_fs_usage *fs_usage,
@@ -1049,57 +1099,23 @@ void bch2_mark_update(struct btree_trans *trans,
 	if (!btree_node_type_needs_gc(iter->btree_id))
 		return;
 
-	if (!(trans->flags & BTREE_INSERT_NOMARK))
-		bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
-			bpos_min(insert->k->k.p, b->key.k.p).offset -
-			bkey_start_offset(&insert->k->k),
-			fs_usage, trans->journal_res.seq, flags);
+	bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
+		bpos_min(insert->k->k.p, b->key.k.p).offset -
+		bkey_start_offset(&insert->k->k),
+		fs_usage, trans->journal_res.seq, flags);
+
+	if (unlikely(trans->flags & BTREE_INSERT_NOMARK_OVERWRITES))
+		return;
 
 	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
 						      KEY_TYPE_discard))) {
 		struct bkey		unpacked;
-		struct bkey_s_c		k;
-		s64			sectors = 0;
-
-		k = bkey_disassemble(b, _k, &unpacked);
+		struct bkey_s_c		k = bkey_disassemble(b, _k, &unpacked);
 
-		if (btree_node_is_extents(b)
-		    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
-		    : bkey_cmp(insert->k->k.p, k.k->p))
+		if (!bch2_mark_overwrite(trans, iter, k, insert->k,
+					 fs_usage, flags))
 			break;
 
-		if (btree_node_is_extents(b)) {
-			switch (bch2_extent_overlap(&insert->k->k, k.k)) {
-			case BCH_EXTENT_OVERLAP_ALL:
-				sectors = -((s64) k.k->size);
-				break;
-			case BCH_EXTENT_OVERLAP_BACK:
-				sectors = bkey_start_offset(&insert->k->k) -
-					k.k->p.offset;
-				break;
-			case BCH_EXTENT_OVERLAP_FRONT:
-				sectors = bkey_start_offset(k.k) -
-					insert->k->k.p.offset;
-				break;
-			case BCH_EXTENT_OVERLAP_MIDDLE:
-				sectors = k.k->p.offset - insert->k->k.p.offset;
-				BUG_ON(sectors <= 0);
-
-				bch2_mark_key_locked(c, k, true, sectors,
-					fs_usage, trans->journal_res.seq,
-					flags);
-
-				sectors = bkey_start_offset(&insert->k->k) -
-					k.k->p.offset;
-				break;
-			}
-
-			BUG_ON(sectors >= 0);
-		}
-
-		bch2_mark_key_locked(c, k, false, sectors,
-			fs_usage, trans->journal_res.seq, flags);
-
 		bch2_btree_node_iter_advance(&node_iter, b);
 	}
 }

fs/bcachefs/buckets.h

@@ -254,6 +254,9 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
 int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
 			struct disk_reservation *);
 
+bool bch2_mark_overwrite(struct btree_trans *, struct btree_iter *,
+			 struct bkey_s_c, struct bkey_i *,
+			 struct bch_fs_usage *, unsigned);
 void bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
 		      struct bch_fs_usage *, unsigned);
 void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *);

fs/bcachefs/recovery.c

@@ -203,63 +203,94 @@ static void replay_now_at(struct journal *j, u64 seq)
 static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
 {
 	struct btree_trans trans;
-	struct btree_iter *iter;
+	struct btree_iter *iter, *split_iter;
 	/*
-	 * We might cause compressed extents to be
-	 * split, so we need to pass in a
-	 * disk_reservation:
+	 * We might cause compressed extents to be split, so we need to pass in
+	 * a disk_reservation:
 	 */
 	struct disk_reservation disk_res =
 		bch2_disk_reservation_init(c, 0);
-	BKEY_PADDED(k) split;
+	struct bkey_i *split;
+	bool split_compressed = false;
+	unsigned flags = BTREE_INSERT_ATOMIC|
+		BTREE_INSERT_NOFAIL|
+		BTREE_INSERT_LAZY_RW|
+		BTREE_INSERT_JOURNAL_REPLAY|
+		BTREE_INSERT_NOMARK;
 	int ret;
 
 	bch2_trans_init(&trans, c);
+	bch2_trans_preload_iters(&trans);
+retry:
+	bch2_trans_begin(&trans);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
 				   bkey_start_pos(&k->k),
 				   BTREE_ITER_INTENT);
+
 	do {
 		ret = bch2_btree_iter_traverse(iter);
 		if (ret)
-			break;
+			goto err;
 
-		bkey_copy(&split.k, k);
-		bch2_cut_front(iter->pos, &split.k);
-		bch2_extent_trim_atomic(&split.k, iter);
+		split_iter = bch2_trans_copy_iter(&trans, iter);
+		ret = PTR_ERR_OR_ZERO(split_iter);
+		if (ret)
+			goto err;
 
-		ret = bch2_disk_reservation_add(c, &disk_res,
-				split.k.k.size *
-				bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&split.k)),
-				BCH_DISK_RESERVATION_NOFAIL);
-		BUG_ON(ret);
+		split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
+		ret = PTR_ERR_OR_ZERO(split);
+		if (ret)
+			goto err;
 
-		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &split.k));
-		ret = bch2_trans_commit(&trans, &disk_res, NULL,
-					BTREE_INSERT_ATOMIC|
-					BTREE_INSERT_NOFAIL|
-					BTREE_INSERT_LAZY_RW|
-					BTREE_INSERT_JOURNAL_REPLAY);
-	} while ((!ret || ret == -EINTR) &&
-		 bkey_cmp(k->k.p, iter->pos));
+		if (!split_compressed &&
+		    bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
+		    !bch2_extent_is_atomic(k, split_iter)) {
+			ret = bch2_disk_reservation_add(c, &disk_res,
+					k->k.size *
+					bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
+					BCH_DISK_RESERVATION_NOFAIL);
+			BUG_ON(ret);
+
+			flags &= ~BTREE_INSERT_JOURNAL_REPLAY;
+			flags &= ~BTREE_INSERT_NOMARK;
+			flags |= BTREE_INSERT_NOMARK_OVERWRITES;
+			split_compressed = true;
+		}
+
+		bkey_copy(split, k);
+		bch2_cut_front(split_iter->pos, split);
+		bch2_extent_trim_atomic(split, split_iter);
+
+		bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
+		bch2_btree_iter_set_pos(iter, split->k.p);
+	} while (bkey_cmp(iter->pos, k->k.p) < 0);
+
+	ret = bch2_trans_commit(&trans, &disk_res, NULL, flags);
+	if (ret)
+		goto err;
+
+	if (split_compressed) {
+		/*
+		 * This isn't strictly correct - we should only be relying on
+		 * the btree node lock for synchronization with gc when we've
+		 * got a write lock held.
+		 *
+		 * but - there are other correctness issues if btree gc were to
+		 * run before journal replay finishes
+		 */
+		BUG_ON(c->gc_pos.phase);
+
+		bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
+			      NULL, 0, 0);
+	}
+err:
+	if (ret == -EINTR)
+		goto retry;
 
 	bch2_disk_reservation_put(c, &disk_res);
 
-	/*
-	 * This isn't strictly correct - we should only be relying on the btree
-	 * node lock for synchronization with gc when we've got a write lock
-	 * held.
-	 *
-	 * but - there are other correctness issues if btree gc were to run
-	 * before journal replay finishes
-	 */
-	BUG_ON(c->gc_pos.phase);
-
-	bch2_mark_key(c, bkey_i_to_s_c(k), false, -((s64) k->k.size),
-		      NULL, 0, 0);
-	bch2_trans_exit(&trans);
-
-	return ret;
+	return bch2_trans_exit(&trans) ?: ret;
 }
 
 static int bch2_journal_replay(struct bch_fs *c,
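
For readers skimming the recovery.c hunk above: the rewritten bch2_extent_replay_key() now follows the usual btree_trans retry idiom - queue every (possibly split) update in a single transaction, commit once, and restart from the `retry:` label on -EINTR. A self-contained sketch of just that control flow, with stand-in types and functions (nothing below is bcachefs API; only the shape of the loop mirrors the code above):

#include <errno.h>
#include <stdio.h>

/* Stand-in for the real btree_trans machinery (illustrative only). */
struct trans { int attempts; };

static void trans_begin(struct trans *t)
{
	(void) t;	/* would drop queued updates, keep iterators */
}

static int trans_commit(struct trans *t)
{
	/* Pretend the first attempt races with another writer. */
	return t->attempts++ == 0 ? -EINTR : 0;
}

static int replay_one_key(struct trans *t)
{
	int ret;
retry:
	trans_begin(t);

	/* ... queue one or more (possibly split) updates here ... */

	ret = trans_commit(t);
	if (ret == -EINTR)
		goto retry;	/* lock restart: rebuild updates from scratch */
	return ret;
}

int main(void)
{
	struct trans t = { 0 };

	printf("replay: %d (after %d attempts)\n",
	       replay_one_key(&t), t.attempts);
	return 0;
}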