bcachefs: Rip out freelists from btree key cache

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2024-06-08 22:32:40 -04:00
parent d2ed0f206a
commit f2bfe7e837
3 changed files with 57 additions and 330 deletions

View File

@ -79,14 +79,24 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
return true;
}
static void bkey_cached_evict(struct btree_key_cache *c,
static bool bkey_cached_evict(struct btree_key_cache *c,
struct bkey_cached *ck)
{
BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params));
memset(&ck->key, ~0, sizeof(ck->key));
bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params);
if (ret) {
memset(&ck->key, ~0, sizeof(ck->key));
atomic_long_dec(&c->nr_keys);
}
atomic_long_dec(&c->nr_keys);
return ret;
}
static void __bkey_cached_free(struct rcu_head *rcu)
{
struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
kmem_cache_free(bch2_key_cache, ck);
}
static void bkey_cached_free(struct btree_key_cache *bc,
@ -94,115 +104,14 @@ static void bkey_cached_free(struct btree_key_cache *bc,
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
bc->nr_freed_pcpu++;
} else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
}
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
#ifdef __KERNEL__
/*
 * Insert @ck into bc->freed_nonpcpu, keeping the list sorted by
 * btree_trans_barrier_seq.
 *
 * The list is walked from the tail towards the head; @ck is placed directly
 * after the first entry found whose sequence number is not greater than
 * @ck's (compared with ULONG_CMP_GE).  If no such entry exists, @ck goes to
 * the front of the list.
 *
 * bc->nr_freed_nonpcpu is incremented unconditionally.  The visible callers
 * hold bc->lock around this call.
 */
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	/* Default insertion point: the list head itself, i.e. the very front. */
	struct list_head *insert_after = &bc->freed_nonpcpu;
	struct bkey_cached *cur;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(cur, &bc->freed_nonpcpu, list)
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 cur->btree_trans_barrier_seq)) {
			insert_after = &cur->list;
			break;
		}

	list_move(&ck->list, insert_after);
}
#endif
/*
 * Park a clean bkey_cached on one of the key cache freelists.
 *
 * @bc: key cache that owns the freelists
 * @ck: entry to park; must not have BKEY_CACHED_DIRTY set (BUG otherwise)
 *
 * Entries whose six lock has percpu readers (ck->c.lock.readers set) are
 * appended to bc->freed_pcpu under bc->lock.
 *
 * All other entries:
 *  - kernel build: are pushed onto this CPU's bc->pcpu_freed array when it
 *    has room.  When it is full, bc->lock is taken, the array is drained
 *    down to half its capacity into the ordered freed_nonpcpu list, and
 *    @ck is then inserted into that ordered list as well.
 *  - non-kernel build: are appended to bc->freed_nonpcpu under bc->lock.
 */
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (ck->c.lock.readers) {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
		mutex_unlock(&bc->lock);
		return;
	}

#ifdef __KERNEL__
	struct btree_key_cache_freelist *cpu_stash;
	bool stashed;

	/* Fast path: drop the entry into this CPU's array without bc->lock. */
	preempt_disable();
	cpu_stash = this_cpu_ptr(bc->pcpu_freed);
	stashed = cpu_stash->nr < ARRAY_SIZE(cpu_stash->objs);
	if (stashed)
		cpu_stash->objs[cpu_stash->nr++] = ck;
	preempt_enable();

	if (stashed)
		return;

	/* Slow path: spill half of the per-CPU array, then @ck, to the list. */
	mutex_lock(&bc->lock);
	preempt_disable();
	/* Re-read the per-CPU pointer: preemption was enabled in between. */
	cpu_stash = this_cpu_ptr(bc->pcpu_freed);

	while (cpu_stash->nr > ARRAY_SIZE(cpu_stash->objs) / 2)
		__bkey_cached_move_to_freelist_ordered(bc,
				cpu_stash->objs[--cpu_stash->nr]);
	preempt_enable();

	__bkey_cached_move_to_freelist_ordered(bc, ck);
	mutex_unlock(&bc->lock);
#else
	mutex_lock(&bc->lock);
	list_move_tail(&ck->list, &bc->freed_nonpcpu);
	bc->nr_freed_nonpcpu++;
	mutex_unlock(&bc->lock);
#endif
}
static void bkey_cached_free_fast(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
list_del_init(&ck->list);
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
bkey_cached_move_to_freelist(bc, ck);
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
call_srcu(&c->btree_trans_barrier, &ck->rcu, __bkey_cached_free);
}
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
@ -222,78 +131,10 @@ static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck = NULL;
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
int ret;
if (!pcpu_readers) {
#ifdef __KERNEL__
struct btree_key_cache_freelist *f;
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
if (f->nr)
ck = f->objs[--f->nr];
preempt_enable();
if (!ck) {
mutex_lock(&bc->lock);
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
while (!list_empty(&bc->freed_nonpcpu) &&
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
ck = f->nr ? f->objs[--f->nr] : NULL;
preempt_enable();
mutex_unlock(&bc->lock);
}
#else
mutex_lock(&bc->lock);
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
} else {
mutex_lock(&bc->lock);
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
if (ck) {
ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
if (unlikely(ret)) {
bkey_cached_move_to_freelist(bc, ck);
return ERR_PTR(ret);
}
btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
ret = bch2_btree_node_lock_write(trans, path, &ck->c);
if (unlikely(ret)) {
btree_node_unlock(trans, path, 0);
bkey_cached_move_to_freelist(bc, ck);
return ERR_PTR(ret);
}
return ck;
}
ck = allocate_dropping_locks(trans, ret,
struct bkey_cached *ck = allocate_dropping_locks(trans, ret,
__bkey_cached_alloc(key_u64s, _gfp));
if (ret) {
if (ck)
@ -305,7 +146,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
if (!ck)
return NULL;
INIT_LIST_HEAD(&ck->list);
bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
ck->c.cached = true;
@ -322,21 +162,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
struct bkey_cached *ck;
unsigned i;
mutex_lock(&c->lock);
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(c, ck);
goto out;
if (bkey_cached_evict(c, ck))
goto out;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
ck = NULL;
out:
rcu_read_unlock();
mutex_unlock(&c->lock);
return ck;
}
@ -415,7 +255,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
bkey_cached_free_fast(bc, ck);
bkey_cached_free(bc, ck);
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
return ret;
@ -611,8 +451,12 @@ evict:
}
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free_fast(&c->btree_key_cache, ck);
if (bkey_cached_evict(&c->btree_key_cache, ck)) {
bkey_cached_free(&c->btree_key_cache, ck);
} else {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
out:
bch2_trans_iter_exit(trans, &b_iter);
@ -722,7 +566,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
}
bkey_cached_evict(bc, ck);
bkey_cached_free_fast(bc, ck);
bkey_cached_free(bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
@ -735,48 +579,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
struct bkey_cached *ck, *t;
struct bkey_cached *ck;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
unsigned start, flags;
unsigned iter, start;
int srcu_idx;
mutex_lock(&bc->lock);
bc->requested_to_free += sc->nr_to_scan;
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
/*
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
bc->nr_freed_nonpcpu--;
bc->freed++;
}
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
bc->nr_freed_pcpu--;
bc->freed++;
}
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
/*
@ -792,17 +602,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
return SHRINK_STOP;
}
if (bc->shrink_iter >= tbl->size)
bc->shrink_iter = 0;
start = bc->shrink_iter;
iter = bc->shrink_iter;
if (iter >= tbl->size)
iter = 0;
start = iter;
do {
struct rhash_head *pos, *next;
pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);
pos = rht_ptr_rcu(&tbl->buckets[iter]);
while (!rht_is_a_nulls(pos)) {
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
ck = container_of(pos, struct bkey_cached, hash);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@ -812,29 +623,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->skipped_accessed++;
} else if (!bkey_cached_lock_for_evict(ck)) {
bc->skipped_lock_fail++;
} else {
bkey_cached_evict(bc, ck);
} else if (bkey_cached_evict(bc, ck)) {
bkey_cached_free(bc, ck);
bc->moved_to_freelist++;
bc->freed++;
freed++;
} else {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
scanned++;
if (scanned >= nr)
break;
goto out;
pos = next;
}
bc->shrink_iter++;
if (bc->shrink_iter >= tbl->size)
bc->shrink_iter = 0;
} while (scanned < nr && bc->shrink_iter != start);
iter++;
if (iter >= tbl->size)
iter = 0;
} while (scanned < nr && iter != start);
out:
bc->shrink_iter = iter;
rcu_read_unlock();
memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);
return freed;
}
@ -862,18 +675,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bucket_table *tbl;
struct bkey_cached *ck, *n;
struct bkey_cached *ck;
struct rhash_head *pos;
LIST_HEAD(items);
unsigned i;
#ifdef __KERNEL__
int cpu;
#endif
shrinker_free(bc->shrink);
mutex_lock(&bc->lock);
/*
* The loop is needed to guard against racing with rehash:
*/
@ -892,44 +700,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
for (i = 0; i < tbl->size; i++)
while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
ck = container_of(pos, struct bkey_cached, hash);
bkey_cached_evict(bc, ck);
list_add(&ck->list, &items);
BUG_ON(!bkey_cached_evict(bc, ck));
kfree(ck->k);
kmem_cache_free(bch2_key_cache, ck);
}
}
rcu_read_unlock();
}
#ifdef __KERNEL__
if (bc->pcpu_freed) {
for_each_possible_cpu(cpu) {
struct btree_key_cache_freelist *f =
per_cpu_ptr(bc->pcpu_freed, cpu);
for (i = 0; i < f->nr; i++) {
ck = f->objs[i];
list_add(&ck->list, &items);
}
}
}
#endif
BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
mutex_unlock(&bc->lock);
list_for_each_entry_safe(ck, n, &items, list) {
cond_resched();
list_del(&ck->list);
kfree(ck->k);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
}
if (atomic_long_read(&bc->nr_dirty) &&
!bch2_journal_error(&c->journal) &&
test_bit(BCH_FS_was_rw, &c->flags))
@ -942,15 +720,10 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
free_percpu(bc->pcpu_freed);
}
/*
 * Initialise the parts of the key cache that need no allocation: the two
 * freed-entry list heads (freed_pcpu, freed_nonpcpu) and the cache mutex.
 *
 * @c: key cache to initialise
 */
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	INIT_LIST_HEAD(&c->freed_nonpcpu);
	INIT_LIST_HEAD(&c->freed_pcpu);
	mutex_init(&c->lock);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@ -958,12 +731,6 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
#ifdef __KERNEL__
bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
if (!bc->pcpu_freed)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
@ -984,45 +751,19 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
/*
 * Print a human-readable summary of the key cache into @out.
 *
 * @out: printbuf receiving the report
 * @bc:  key cache to describe
 *
 * Emits, in order: the key/dirty/freelist counters and hash table size, the
 * shrinker statistics, the current SRCU sequence of btree_trans_barrier, and
 * then the btree_trans_barrier_seq of the leading entries of the
 * freed_nonpcpu and freed_pcpu lists (at most 11 lines per list).
 *
 * The report is produced with bc->lock held, inside a
 * memalloc_nofs_save() scope.
 */
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bkey_cached *entry;
	unsigned nofs_flags;
	unsigned shown;

	printbuf_tabstop_push(out, 24);
	printbuf_tabstop_push(out, 12);

	nofs_flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/* Overall counters. */
	prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
	prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
	prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
	prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
	prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
	prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);

	/* Shrinker statistics. */
	prt_printf(out, "\nshrinker:\n");
	prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
	prt_printf(out, "freed:\t%lu\r\n", bc->freed);
	prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
	prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
	prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
	prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);

	prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));

	/* Sample of each freelist: stop once the 11th entry has been printed. */
	shown = 0;
	list_for_each_entry(entry, &bc->freed_nonpcpu, list) {
		prt_printf(out, "freed_nonpcpu:\t%lu\r\n", entry->btree_trans_barrier_seq);
		if (shown++ >= 10)
			break;
	}

	shown = 0;
	list_for_each_entry(entry, &bc->freed_pcpu, list) {
		prt_printf(out, "freed_pcpu:\t%lu\r\n", entry->btree_trans_barrier_seq);
		if (shown++ >= 10)
			break;
	}

	mutex_unlock(&bc->lock);
	memalloc_flags_restore(nofs_flags);
}
void bch2_btree_key_cache_exit(void)

View File

@ -2,33 +2,19 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
/*
 * Small fixed-capacity stash of bkey_cached pointers; struct btree_key_cache
 * holds one of these per CPU (its __percpu pcpu_freed member).
 */
struct btree_key_cache_freelist {
/* Stashed entries; only the first @nr slots are in use. */
struct bkey_cached *objs[16];
/* Number of occupied slots in @objs. */
unsigned nr;
};
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
struct list_head freed_pcpu;
size_t nr_freed_pcpu;
struct list_head freed_nonpcpu;
size_t nr_freed_nonpcpu;
struct shrinker *shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
/* shrinker stats */
unsigned long requested_to_free;
unsigned long freed;
unsigned long moved_to_freelist;
unsigned long skipped_dirty;
unsigned long skipped_accessed;
unsigned long skipped_lock_fail;

View File

@ -386,17 +386,17 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
unsigned long btree_trans_barrier_seq;
u16 u64s;
struct bkey_cached_key key;
struct rhash_head hash;
struct list_head list;
struct journal_entry_pin journal;
u64 seq;
struct bkey_i *k;
struct rcu_head rcu;
};
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)