bcachefs: Rework btree node pinning

In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers

Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).

Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.

Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2024-09-04 20:49:37 -04:00
parent 91ddd71510
commit 7a51608d01
7 changed files with 150 additions and 75 deletions

View File

@ -752,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
bch2_btree_cache_unpin(c);
btree_interior_mask |= btree_leaf_mask;
c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask;
c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask;
c->btree_cache.pinned_nodes_start = start;
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
@ -777,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
BBPOS(btree, b->key.k.p);
break;
}
bch2_node_pin(c, b);
0;
}));
}
@ -936,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
c->btree_cache.pinned_nodes_leaf_mask = 0;
c->btree_cache.pinned_nodes_interior_mask = 0;
bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;
@ -1053,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
c->btree_cache.pinned_nodes_leaf_mask = 0;
c->btree_cache.pinned_nodes_interior_mask = 0;
bch2_btree_cache_unpin(c);
bch_err_fn(c, ret);
return ret;

View File

@ -47,9 +47,14 @@ void bch2_recalc_btree_reserve(struct bch_fs *c)
c->btree_cache.nr_reserve = reserve;
}
/*
 * Number of nodes a shrinker may free from one btree cache list.
 *
 * NOTE(review): this listing carries both the pre-change lines (taking a
 * struct btree_cache *) and the post-change lines (taking a
 * struct btree_cache_list *); the description below covers the post-change
 * lines.
 *
 * The owning btree_cache is recovered from @list with container_of(), using
 * list->idx as the index into bc->live[]. The starting value is list->nr.
 * Only for list index 0 is bc->nr_reserve subtracted, with the result
 * clamped at zero; any other list index reports list->nr unchanged.
 */
static inline size_t btree_cache_can_free(struct btree_cache *bc)
static inline size_t btree_cache_can_free(struct btree_cache_list *list)
{
return max_t(int, 0, bc->nr_live + bc->nr_freeable - bc->nr_reserve);
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
size_t can_free = list->nr;
if (!list->idx)
/* compared as ssize_t so a reserve larger than nr yields 0, not a wrapped size_t */
can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
return can_free;
}
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
@ -184,6 +189,51 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
/*
 * Test whether node @b matches the pin selection stored in @bc.
 *
 * Returns true only when all of the following hold:
 *  - the bit for b's btree id is set in the mask chosen by b's level
 *    (pinned_nodes_mask[0] when level is 0, pinned_nodes_mask[1] otherwise);
 *  - bbpos_cmp(pinned_nodes_start, pos) < 0;
 *  - bbpos_cmp(pinned_nodes_end, pos) >= 0;
 * where pos is built from b's btree id and b->key.k.p.
 *
 * NOTE(review): assuming bbpos_cmp() is a conventional three-way compare,
 * this selects positions in (pinned_nodes_start, pinned_nodes_end] - confirm.
 */
static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
/* !! collapses every non-zero level onto mask slot 1 */
u64 mask = bc->pinned_nodes_mask[!!b->c.level];
return ((mask & BIT_ULL(b->c.btree_id)) &&
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
}
/*
 * Mark node @b as pinned and move it onto the pinned list.
 *
 * Takes bc->lock for the whole operation. Hits BUG_ON() if @b does not match
 * the current pin selection as reported by __btree_node_pinned().
 *
 * The move is skipped when @b equals btree_node_root(c, b) or when @b already
 * has the pinned flag set. Otherwise the pinned flag is set, @b is moved to
 * bc->live[1].list, and the two list counters are adjusted to match
 * (live[0].nr decremented, live[1].nr incremented).
 */
void bch2_node_pin(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
mutex_lock(&bc->lock);
BUG_ON(!__btree_node_pinned(bc, b));
if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
set_btree_node_pinned(b);
list_move(&b->list, &bc->live[1].list);
/* keep the per-list counts in step with the list move above */
bc->live[0].nr--;
bc->live[1].nr++;
}
mutex_unlock(&bc->lock);
}
/*
 * Drop the current pin selection and unpin every pinned node.
 *
 * Under bc->lock: zeroes both entries of pinned_nodes_mask, then walks
 * bc->live[1].list and, for each node, clears its pinned flag, moves it to
 * bc->live[0].list and transfers one count from live[1].nr to live[0].nr.
 */
void bch2_btree_cache_unpin(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *n;
mutex_lock(&bc->lock);
c->btree_cache.pinned_nodes_mask[0] = 0;
c->btree_cache.pinned_nodes_mask[1] = 0;
/* _safe iteration: each entry is moved off the list being walked */
list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
clear_btree_node_pinned(b);
list_move(&b->list, &bc->live[0].list);
bc->live[0].nr++;
bc->live[1].nr--;
}
mutex_unlock(&bc->lock);
}
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
@ -199,7 +249,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
if (b->c.btree_id < BTREE_ID_NR)
--bc->nr_by_btree[b->c.btree_id];
bc->nr_live--;
bc->live[btree_node_pinned(b)].nr--;
bc->nr_freeable++;
list_move(&b->list, &bc->freeable);
}
@ -216,9 +266,14 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
if (b->c.btree_id < BTREE_ID_NR)
bc->nr_by_btree[b->c.btree_id]++;
bc->nr_live++;
bool p = __btree_node_pinned(bc, b);
mod_bit(BTREE_NODE_pinned, &b->flags, p);
list_move_tail(&b->list, &bc->live[p].list);
bc->live[p].nr++;
bc->nr_freeable--;
list_move_tail(&b->list, &bc->live);
return 0;
}
@ -283,20 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b
int ret = 0;
lockdep_assert_held(&bc->lock);
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
u64 mask = b->c.level
? bc->pinned_nodes_interior_mask
: bc->pinned_nodes_leaf_mask;
if ((mask & BIT_ULL(b->c.btree_id)) &&
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) {
BTREE_CACHE_NOT_FREED_INCREMENT(pinned);
return -BCH_ERR_ENOMEM_btree_node_reclaim;
}
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@ -401,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree_cache_list *list = shrink->private_data;
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free = 0;
@ -410,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long touched = 0;
unsigned i, flags;
unsigned long ret = SHRINK_STOP;
bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >=
(bc->nr_live + bc->nr_freeable) * 3 / 4;
bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
@ -426,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
can_free = btree_cache_can_free(bc);
can_free = btree_cache_can_free(list);
nr = min_t(unsigned long, nr, can_free);
i = 0;
@ -452,7 +493,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
}
}
restart:
list_for_each_entry_safe(b, t, &bc->live, list) {
list_for_each_entry_safe(b, t, &list->list, list) {
touched++;
if (btree_node_accessed(b)) {
@ -476,7 +517,7 @@ restart:
!btree_node_will_make_reachable(b) &&
!btree_node_write_blocked(b) &&
six_trylock_read(&b->c.lock)) {
list_move(&bc->live, &b->list);
list_move(&list->list, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
@ -490,8 +531,8 @@ restart:
break;
}
out_rotate:
if (&t->list != &bc->live)
list_move_tail(&bc->live, &t->list);
if (&t->list != &list->list)
list_move_tail(&list->list, &t->list);
out:
mutex_unlock(&bc->lock);
out_nounlock:
@ -504,40 +545,42 @@ out_nounlock:
/*
 * Function installed as the count_objects callback of the btree cache
 * shrinkers.
 *
 * NOTE(review): this listing carries both the pre-change lines (private_data
 * is a struct bch_fs *) and the post-change lines (private_data is a
 * struct btree_cache_list *); the description below covers the post-change
 * lines.
 *
 * Returns 0 when bch2_btree_shrinker_disabled is set, otherwise the value of
 * btree_cache_can_free() for the list stored in shrink->private_data.
 * @sc is not read.
 */
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree_cache_list *list = shrink->private_data;
if (bch2_btree_shrinker_disabled)
return 0;
return btree_cache_can_free(bc);
return btree_cache_can_free(list);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned i, flags;
unsigned long flags;
shrinker_free(bc->shrink);
shrinker_free(bc->live[1].shrink);
shrinker_free(bc->live[0].shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
list_move(&c->verify_data->list, &bc->live[0].list);
kvfree(c->verify_ondisk);
for (i = 0; i < btree_id_nr_alive(c); i++) {
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->b)
list_add(&r->b->list, &bc->live);
list_add(&r->b->list, &bc->live[0].list);
}
list_for_each_entry_safe(b, t, &bc->live, list)
list_for_each_entry_safe(b, t, &bc->live[1].list, list)
bch2_btree_node_hash_remove(bc, b);
list_for_each_entry_safe(b, t, &bc->live[0].list, list)
bch2_btree_node_hash_remove(bc, b);
list_for_each_entry_safe(b, t, &bc->freeable, list) {
@ -563,7 +606,8 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
BUG_ON(bc->nr_by_btree[i]);
BUG_ON(bc->nr_live);
BUG_ON(bc->live[0].nr);
BUG_ON(bc->live[1].nr);
BUG_ON(bc->nr_freeable);
if (bc->table_init_done)
@ -589,18 +633,28 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
if (!__bch2_btree_node_mem_alloc(c))
goto err;
list_splice_init(&bc->live, &bc->freeable);
list_splice_init(&bc->live[0].list, &bc->freeable);
mutex_init(&c->verify_lock);
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
bc->shrink = shrink;
bc->live[0].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
shrink->seeks = 4;
shrink->private_data = c;
shrink->seeks = 2;
shrink->private_data = &bc->live[0];
shrinker_register(shrink);
shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
if (!shrink)
goto err;
bc->live[1].shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
shrink->seeks = 8;
shrink->private_data = &bc->live[1];
shrinker_register(shrink);
return 0;
@ -611,7 +665,10 @@ err:
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
bc->live[i].idx = i;
INIT_LIST_HEAD(&bc->live[i].list);
}
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed_pcpu);
INIT_LIST_HEAD(&bc->freed_nonpcpu);
@ -673,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
if (!btree_node_reclaim(c, b, false))
return b;
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
list_for_each_entry_reverse(b, &bc->live[i].list, list)
if (!btree_node_reclaim(c, b, false))
return b;
while (1) {
list_for_each_entry_reverse(b, &bc->live, list)
if (!btree_node_write_and_reclaim(c, b))
return b;
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
list_for_each_entry_reverse(b, &bc->live[i].list, list)
if (!btree_node_write_and_reclaim(c, b))
return b;
/*
* Rare case: all nodes were intent-locked.
@ -1387,9 +1446,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
prt_btree_cache_line(out, c, "nr_live:", bc->nr_live);
prt_btree_cache_line(out, c, "nr_freeable:", bc->nr_freeable);
prt_btree_cache_line(out, c, "nr dirty:", atomic_long_read(&bc->nr_dirty));
prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
prt_newline(out);

View File

@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
void bch2_node_pin(struct bch_fs *, struct btree *);
void bch2_btree_cache_unpin(struct bch_fs *);
void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_i *);

View File

@ -147,8 +147,7 @@ struct btree {
x(noevict) \
x(write_blocked) \
x(will_make_reachable) \
x(access_bit) \
x(pinned) \
x(access_bit)
enum bch_btree_cache_not_freed_reasons {
#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n,
@ -157,6 +156,13 @@ enum bch_btree_cache_not_freed_reasons {
BCH_BTREE_CACHE_NOT_FREED_REASONS_NR,
};
/*
 * One list of cached btree nodes together with the shrinker that reclaims
 * from it. struct btree_cache embeds an array of these as live[].
 */
struct btree_cache_list {
/* position of this entry in btree_cache.live[]; used with container_of() to get back to the btree_cache */
unsigned idx;
/* shrinker whose private_data points at this list */
struct shrinker *shrink;
/* head of the list of nodes, linked through struct btree's list member */
struct list_head list;
/* count of nodes accounted to this list */
size_t nr;
};
struct btree_cache {
struct rhashtable table;
bool table_init_done;
@ -174,12 +180,11 @@ struct btree_cache {
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
struct list_head live;
struct list_head freeable;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
struct btree_cache_list live[2];
size_t nr_live;
size_t nr_freeable;
size_t nr_reserve;
size_t nr_by_btree[BTREE_ID_NR];
@ -188,7 +193,6 @@ struct btree_cache {
/* shrinker stats */
size_t nr_freed;
u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR];
struct shrinker *shrink;
/*
* If we need to allocate memory for a new btree node and that
@ -201,8 +205,8 @@ struct btree_cache {
struct bbpos pinned_nodes_start;
struct bbpos pinned_nodes_end;
u64 pinned_nodes_leaf_mask;
u64 pinned_nodes_interior_mask;
/* btree id mask: 0 for leaves, 1 for interior */
u64 pinned_nodes_mask[2];
};
struct btree_node_iter {
@ -594,7 +598,8 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
x(never_write)
x(never_write) \
x(pinned)
enum btree_flags {
/* First bits for btree node write type */

View File

@ -1904,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
list_add_tail(&b->list, &c->btree_cache.live);
list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);

View File

@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_cache *bc = &c->btree_cache;
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_long_read(&c->btree_cache.nr_dirty) * 2 > c->btree_cache.nr_live)
size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr;
if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live)
min_nr = 1;
min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
atomic_long_read(&c->btree_cache.nr_dirty),
c->btree_cache.nr_live,
atomic_long_read(&bc->nr_dirty), btree_cache_live,
atomic_long_read(&c->btree_key_cache.nr_dirty),
atomic_long_read(&c->btree_key_cache.nr_keys));

View File

@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = {
/*
 * Total btree_buf_bytes() of the nodes held in the btree cache.
 *
 * NOTE(review): this listing carries both the pre-change lines (a single
 * walk of btree_cache.live) and the post-change lines; the description below
 * covers the post-change lines.
 *
 * With bc->lock held, walks bc->live[0].list, bc->live[1].list and
 * bc->freeable in turn and adds btree_buf_bytes() of every node to the
 * running total, which is returned after the lock is dropped.
 */
static size_t bch2_btree_cache_size(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
size_t ret = 0;
struct btree *b;
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
mutex_lock(&bc->lock);
list_for_each_entry(b, &bc->live[0].list, list)
ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
list_for_each_entry(b, &bc->live[1].list, list)
ret += btree_buf_bytes(b);
list_for_each_entry(b, &bc->freeable, list)
ret += btree_buf_bytes(b);
mutex_unlock(&bc->lock);
return ret;
}
@ -444,11 +448,12 @@ STORE(bch2_fs)
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
struct btree_cache *bc = &c->btree_cache;
struct shrink_control sc;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc);
}
if (attr == &sysfs_trigger_btree_key_cache_shrink) {