btrfs: make the extent map shrinker run asynchronously as a work queue job

Currently the extent map shrinker is run synchronously for kswapd tasks
that end up calling the fs shrinker (fs/super.c:super_cache_scan()).
This has some disadvantages and for some heavy workloads with memory
pressure it can cause some delays and stalls that make a machine
unresponsive for some periods. This happens because:

1) We can have several kswapd tasks on machines with multiple NUMA zones,
   and running the extent map shrinker concurrently can cause high
   contention on some spin locks, namely the spin locks that protect
   the radix tree that tracks roots, the per root xarray that tracks
   open inodes and the list of delayed iputs. This not only delays the
   shrinker but also causes high CPU consumption and makes the task
   running the shrinker monopolize a core, resulting in the symptoms
   of an unresponsive system. This was noted in previous commits such as
   commit ae1e766f62 ("btrfs: only run the extent map shrinker from
   kswapd tasks");

2) The extent map shrinker's iteration over inodes can often be slow, even
   after changing the data structure that tracks open inodes for a root
   from a red black tree (up to kernel 6.10) to an xarray (kernel 6.10+).
   The transition to the xarray while it made things a bit faster, it's
   still somewhat slow - for example in a test scenario with 10000 inodes
   that have no extent maps loaded, the extent map shrinker took between
   5ms to 8ms, using a release, non-debug kernel. Iterating over the
   extent maps of an inode can also be slow if have an inode with many
   thousands of extent maps, since we use a red black tree to track and
   search extent maps. So having the extent map shrinker run synchronously
   adds extra delay for other things a kswapd task does.

So make the extent map shrinker run asynchronously as a job for the
system unbounded workqueue, just like what we do for data and metadata
space reclaim jobs.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Filipe Manana 2024-08-29 15:23:32 +01:00 committed by David Sterba
parent 03ba050583
commit 1020443840
5 changed files with 52 additions and 19 deletions

View File

@ -2785,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_scrub(fs_info);
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
btrfs_init_extent_map_shrinker_work(fs_info);
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@ -4292,6 +4293,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
cancel_work_sync(&fs_info->extent_map_shrinker_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);

View File

@ -1124,7 +1124,8 @@ struct btrfs_em_shrink_ctx {
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
long nr_dropped = 0;
struct rb_node *node;
@ -1197,7 +1198,8 @@ next:
* lock. This is to avoid slowing other tasks trying to take the
* lock.
*/
if (need_resched() || rwlock_needbreak(&tree->lock))
if (need_resched() || rwlock_needbreak(&tree->lock) ||
btrfs_fs_closing(fs_info))
break;
node = next;
}
@ -1221,7 +1223,8 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
ctx->last_ino = btrfs_ino(inode);
btrfs_add_delayed_iput(inode);
if (ctx->scanned >= ctx->nr_to_scan)
if (ctx->scanned >= ctx->nr_to_scan ||
btrfs_fs_closing(inode->root->fs_info))
break;
cond_resched();
@ -1250,16 +1253,19 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
return nr_dropped;
}
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{
struct btrfs_fs_info *fs_info;
struct btrfs_em_shrink_ctx ctx;
u64 start_root_id;
u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
fs_info = container_of(work, struct btrfs_fs_info, extent_map_shrinker_work);
ctx.scanned = 0;
ctx.nr_to_scan = nr_to_scan;
ctx.nr_to_scan = atomic64_read(&fs_info->extent_map_shrinker_nr_to_scan);
/*
* In case we have multiple tasks running this shrinker, make the next
@ -1277,12 +1283,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, ctx.nr_to_scan,
nr, ctx.last_root,
ctx.last_ino);
}
while (ctx.scanned < ctx.nr_to_scan) {
while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
struct btrfs_root *root;
unsigned long count;
@ -1340,5 +1346,34 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
ctx.last_ino);
}
return nr_dropped;
atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
}
void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
/*
* Do nothing if the shrinker is already running. In case of high memory
* pressure we can have a lot of tasks calling us and all passing the
* same nr_to_scan value, but in reality we may need only to free
* nr_to_scan extent maps (or less). In case we need to free more than
* that, we will be called again by the fs shrinker, so no worries about
* not doing enough work to reclaim memory from extent maps.
* We can also be repeatedly called with the same nr_to_scan value
* simply because the shrinker runs asynchronously and multiple calls
* to this function are made before the shrinker does enough progress.
*
* That's why we set the atomic counter to nr_to_scan only if its
* current value is zero, instead of incrementing the counter by
* nr_to_scan.
*/
if (atomic64_cmpxchg(&fs_info->extent_map_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
return;
queue_work(system_unbound_wq, &fs_info->extent_map_shrinker_work);
}
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
{
atomic64_set(&fs_info->extent_map_shrinker_nr_to_scan, 0);
INIT_WORK(&fs_info->extent_map_shrinker_work, btrfs_extent_map_shrinker_worker);
}

View File

@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
#endif

View File

@ -638,6 +638,8 @@ struct btrfs_fs_info {
spinlock_t extent_map_shrinker_lock;
u64 extent_map_shrinker_last_root;
u64 extent_map_shrinker_last_ino;
atomic64_t extent_map_shrinker_nr_to_scan;
struct work_struct extent_map_shrinker_work;
/* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;

View File

@ -28,7 +28,6 @@
#include <linux/btrfs.h>
#include <linux/security.h>
#include <linux/fs_parser.h>
#include <linux/swap.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@ -2408,16 +2407,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
/*
* We may be called from any task trying to allocate memory and we don't
* want to slow it down with scanning and dropping extent maps. It would
* also cause heavy lock contention if many tasks concurrently enter
* here. Therefore only allow kswapd tasks to scan and drop extent maps.
*/
if (!current_is_kswapd())
return 0;
btrfs_free_extent_maps(fs_info, nr_to_scan);
return btrfs_free_extent_maps(fs_info, nr_to_scan);
/* The extent map shrinker runs asynchronously, so always return 0. */
return 0;
}
static const struct super_operations btrfs_super_ops = {