From 7a51608d0125469664e2daf8e060d6d783924c98 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Sep 2024 20:49:37 -0400 Subject: [PATCH] bcachefs: Rework btree node pinning In backpointers fsck, we do a seqential scan of one btree, and check references to another: extents <-> backpointers Checking references generates random lookups, so we want to pin that btree in memory (or only a range, if it doesn't fit in ram). Previously, this was done with a simple check in the shrinker - "if btree node is in range being pinned, don't free it" - but this generated OOMs, as our shrinker wasn't well behaved if there was less memory available than expected. Instead, we now have two different shrinkers and lru lists; the second shrinker being for pinned nodes, with seeks set much higher than normal - so they can still be freed if necessary, but we'll prefer not to. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 13 ++- fs/bcachefs/btree_cache.c | 164 +++++++++++++++++++--------- fs/bcachefs/btree_cache.h | 3 + fs/bcachefs/btree_types.h | 21 ++-- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/journal_reclaim.c | 7 +- fs/bcachefs/sysfs.c | 15 ++- 7 files changed, 150 insertions(+), 75 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 6c395445f60c..b88d1801652c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -752,10 +752,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; + bch2_btree_cache_unpin(c); + btree_interior_mask |= btree_leaf_mask; - c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; - c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; + c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; + c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; c->btree_cache.pinned_nodes_start = start; c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; @@ -777,6 +779,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, BBPOS(btree, b->key.k.p); break; } + bch2_node_pin(c, b); 0; })); } @@ -936,8 +939,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); - c->btree_cache.pinned_nodes_leaf_mask = 0; - c->btree_cache.pinned_nodes_interior_mask = 0; + bch2_btree_cache_unpin(c); bch_err_fn(c, ret); return ret; @@ -1053,8 +1055,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); - c->btree_cache.pinned_nodes_leaf_mask = 0; - c->btree_cache.pinned_nodes_interior_mask = 0; + bch2_btree_cache_unpin(c); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index e66853b98857..6e4afb2b5441 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -47,9 +47,14 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) c->btree_cache.nr_reserve = reserve; } -static inline size_t btree_cache_can_free(struct btree_cache *bc) +static inline size_t btree_cache_can_free(struct btree_cache_list *list) { - return max_t(int, 0, bc->nr_live + bc->nr_freeable - bc->nr_reserve); + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + + size_t can_free = list->nr; + if (!list->idx) + can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve); + return can_free; } static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) @@ -184,6 +189,51 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) six_unlock_intent(&b->c.lock); } +static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) +{ + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = bc->pinned_nodes_mask[!!b->c.level]; + + return ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0); +} + +void bch2_node_pin(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; + + mutex_lock(&bc->lock); + BUG_ON(!__btree_node_pinned(bc, b)); + if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { + set_btree_node_pinned(b); + list_move(&b->list, &bc->live[1].list); + bc->live[0].nr--; + bc->live[1].nr++; + } + mutex_unlock(&bc->lock); +} + +void bch2_btree_cache_unpin(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *n; + + mutex_lock(&bc->lock); + c->btree_cache.pinned_nodes_mask[0] = 0; + c->btree_cache.pinned_nodes_mask[1] = 0; + + list_for_each_entry_safe(b, n, &bc->live[1].list, list) { + clear_btree_node_pinned(b); + list_move(&b->list, &bc->live[0].list); + bc->live[0].nr++; + bc->live[1].nr--; + } + + mutex_unlock(&bc->lock); +} + /* Btree in memory cache - hash table */ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) @@ -199,7 +249,7 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) if (b->c.btree_id < BTREE_ID_NR) --bc->nr_by_btree[b->c.btree_id]; - bc->nr_live--; + bc->live[btree_node_pinned(b)].nr--; bc->nr_freeable++; list_move(&b->list, &bc->freeable); } @@ -216,9 +266,14 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) if (b->c.btree_id < BTREE_ID_NR) bc->nr_by_btree[b->c.btree_id]++; - bc->nr_live++; + + bool p = __btree_node_pinned(bc, b); + mod_bit(BTREE_NODE_pinned, &b->flags, p); + + list_move_tail(&b->list, &bc->live[p].list); + bc->live[p].nr++; + bc->nr_freeable--; - list_move_tail(&b->list, &bc->live); return 0; } @@ -283,20 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b int ret = 0; lockdep_assert_held(&bc->lock); - - struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); - - u64 mask = b->c.level - ? bc->pinned_nodes_interior_mask - : bc->pinned_nodes_leaf_mask; - - if ((mask & BIT_ULL(b->c.btree_id)) && - bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && - bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) { - BTREE_CACHE_NOT_FREED_INCREMENT(pinned); - return -BCH_ERR_ENOMEM_btree_node_reclaim; - } - wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| @@ -401,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = shrink->private_data; - struct btree_cache *bc = &c->btree_cache; + struct btree_cache_list *list = shrink->private_data; + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); struct btree *b, *t; unsigned long nr = sc->nr_to_scan; unsigned long can_free = 0; @@ -410,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long touched = 0; unsigned i, flags; unsigned long ret = SHRINK_STOP; - bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= - (bc->nr_live + bc->nr_freeable) * 3 / 4; + bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; @@ -426,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress: */ - can_free = btree_cache_can_free(bc); + can_free = btree_cache_can_free(list); nr = min_t(unsigned long, nr, can_free); i = 0; @@ -452,7 +493,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, } } restart: - list_for_each_entry_safe(b, t, &bc->live, list) { + list_for_each_entry_safe(b, t, &list->list, list) { touched++; if (btree_node_accessed(b)) { @@ -476,7 +517,7 @@ restart: !btree_node_will_make_reachable(b) && !btree_node_write_blocked(b) && six_trylock_read(&b->c.lock)) { - list_move(&bc->live, &b->list); + list_move(&list->list, &b->list); mutex_unlock(&bc->lock); __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_read(&b->c.lock); @@ -490,8 +531,8 @@ restart: break; } out_rotate: - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); + if (&t->list != &list->list) + list_move_tail(&list->list, &t->list); out: mutex_unlock(&bc->lock); out_nounlock: @@ -504,40 +545,42 @@ out_nounlock: static unsigned long bch2_btree_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = shrink->private_data; - struct btree_cache *bc = &c->btree_cache; + struct btree_cache_list *list = shrink->private_data; if (bch2_btree_shrinker_disabled) return 0; - return btree_cache_can_free(bc); + return btree_cache_can_free(list); } void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; struct btree *b, *t; - unsigned i, flags; + unsigned long flags; - shrinker_free(bc->shrink); + shrinker_free(bc->live[1].shrink); + shrinker_free(bc->live[0].shrink); /* vfree() can allocate memory: */ flags = memalloc_nofs_save(); mutex_lock(&bc->lock); if (c->verify_data) - list_move(&c->verify_data->list, &bc->live); + list_move(&c->verify_data->list, &bc->live[0].list); kvfree(c->verify_ondisk); - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) - list_add(&r->b->list, &bc->live); + list_add(&r->b->list, &bc->live[0].list); } - list_for_each_entry_safe(b, t, &bc->live, list) + list_for_each_entry_safe(b, t, &bc->live[1].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->live[0].list, list) bch2_btree_node_hash_remove(bc, b); list_for_each_entry_safe(b, t, &bc->freeable, list) { @@ -563,7 +606,8 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) BUG_ON(bc->nr_by_btree[i]); - BUG_ON(bc->nr_live); + BUG_ON(bc->live[0].nr); + BUG_ON(bc->live[1].nr); BUG_ON(bc->nr_freeable); if (bc->table_init_done) @@ -589,18 +633,28 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) if (!__bch2_btree_node_mem_alloc(c)) goto err; - list_splice_init(&bc->live, &bc->freeable); + list_splice_init(&bc->live[0].list, &bc->freeable); mutex_init(&c->verify_lock); shrink = shrinker_alloc(0, "%s-btree_cache", c->name); if (!shrink) goto err; - bc->shrink = shrink; + bc->live[0].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 4; - shrink->private_data = c; + shrink->seeks = 2; + shrink->private_data = &bc->live[0]; + shrinker_register(shrink); + + shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name); + if (!shrink) + goto err; + bc->live[1].shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + shrink->scan_objects = bch2_btree_cache_scan; + shrink->seeks = 8; + shrink->private_data = &bc->live[1]; shrinker_register(shrink); return 0; @@ -611,7 +665,10 @@ err: void bch2_fs_btree_cache_init_early(struct btree_cache *bc) { mutex_init(&bc->lock); - INIT_LIST_HEAD(&bc->live); + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) { + bc->live[i].idx = i; + INIT_LIST_HEAD(&bc->live[i].list); + } INIT_LIST_HEAD(&bc->freeable); INIT_LIST_HEAD(&bc->freed_pcpu); INIT_LIST_HEAD(&bc->freed_nonpcpu); @@ -673,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; struct btree *b; - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b, false)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_reclaim(c, b, false)) + return b; while (1) { - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_write_and_reclaim(c, b)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_write_and_reclaim(c, b)) + return b; /* * Rare case: all nodes were intent-locked. @@ -1387,9 +1446,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_btree_cache_line(out, c, "nr_live:", bc->nr_live); - prt_btree_cache_line(out, c, "nr_freeable:", bc->nr_freeable); - prt_btree_cache_line(out, c, "nr dirty:", atomic_long_read(&bc->nr_dirty)); + prt_btree_cache_line(out, c, "live:", bc->live[0].nr); + prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); + prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); + prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); prt_newline(out); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index f82064007127..367acd217c6a 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); +void bch2_node_pin(struct bch_fs *, struct btree *); +void bch2_btree_cache_unpin(struct bch_fs *); + void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ee3df2a486cc..4568a41fefaf 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -147,8 +147,7 @@ struct btree { x(noevict) \ x(write_blocked) \ x(will_make_reachable) \ - x(access_bit) \ - x(pinned) \ + x(access_bit) enum bch_btree_cache_not_freed_reasons { #define x(n) BCH_BTREE_CACHE_NOT_FREED_##n, @@ -157,6 +156,13 @@ enum bch_btree_cache_not_freed_reasons { BCH_BTREE_CACHE_NOT_FREED_REASONS_NR, }; +struct btree_cache_list { + unsigned idx; + struct shrinker *shrink; + struct list_head list; + size_t nr; +}; + struct btree_cache { struct rhashtable table; bool table_init_done; @@ -174,12 +180,11 @@ struct btree_cache { * should never grow past ~2-3 nodes in practice. */ struct mutex lock; - struct list_head live; struct list_head freeable; struct list_head freed_pcpu; struct list_head freed_nonpcpu; + struct btree_cache_list live[2]; - size_t nr_live; size_t nr_freeable; size_t nr_reserve; size_t nr_by_btree[BTREE_ID_NR]; @@ -188,7 +193,6 @@ struct btree_cache { /* shrinker stats */ size_t nr_freed; u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR]; - struct shrinker *shrink; /* * If we need to allocate memory for a new btree node and that @@ -201,8 +205,8 @@ struct btree_cache { struct bbpos pinned_nodes_start; struct bbpos pinned_nodes_end; - u64 pinned_nodes_leaf_mask; - u64 pinned_nodes_interior_mask; + /* btree id mask: 0 for leaves, 1 for interior */ + u64 pinned_nodes_mask[2]; }; struct btree_node_iter { @@ -594,7 +598,8 @@ enum btree_write_type { x(dying) \ x(fake) \ x(need_rewrite) \ - x(never_write) + x(never_write) \ + x(pinned) enum btree_flags { /* First bits for btree node write type */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 18494a662e0a..190bc1e81756 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1904,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * six_unlock_intent(&n->c.lock); mutex_lock(&c->btree_cache.lock); - list_add_tail(&b->list, &c->btree_cache.live); + list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); mutex_unlock(&c->btree_cache.lock); bch2_trans_verify_locks(trans); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index f8e045982753..ace291f175dd 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j) static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_cache *bc = &c->btree_cache; bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush; size_t min_nr, min_key_cache, nr_flushed; @@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; - if (atomic_long_read(&c->btree_cache.nr_dirty) * 2 > c->btree_cache.nr_live) + size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr; + if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live) min_nr = 1; min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); @@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - atomic_long_read(&c->btree_cache.nr_dirty), - c->btree_cache.nr_live, + atomic_long_read(&bc->nr_dirty), btree_cache_live, atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 6791540d6a4a..03e59f86f360 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -244,14 +244,18 @@ static struct attribute sysfs_state_rw = { static size_t bch2_btree_cache_size(struct bch_fs *c) { + struct btree_cache *bc = &c->btree_cache; size_t ret = 0; struct btree *b; - mutex_lock(&c->btree_cache.lock); - list_for_each_entry(b, &c->btree_cache.live, list) + mutex_lock(&bc->lock); + list_for_each_entry(b, &bc->live[0].list, list) ret += btree_buf_bytes(b); - - mutex_unlock(&c->btree_cache.lock); + list_for_each_entry(b, &bc->live[1].list, list) + ret += btree_buf_bytes(b); + list_for_each_entry(b, &bc->freeable, list) + ret += btree_buf_bytes(b); + mutex_unlock(&bc->lock); return ret; } @@ -444,11 +448,12 @@ STORE(bch2_fs) return -EROFS; if (attr == &sysfs_trigger_btree_cache_shrink) { + struct btree_cache *bc = &c->btree_cache; struct shrink_control sc; sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); + bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc); } if (attr == &sysfs_trigger_btree_key_cache_shrink) {