btrfs: avoid races when tracking progress for extent map shrinking

We store the progress (root and inode numbers) of the extent map shrinker
in fs_info without any synchronization, but we can have multiple tasks
calling into the shrinker during memory allocations, for example when
there's enough memory pressure.

This can result in a task A reading fs_info->extent_map_shrinker_last_ino
after another task B updates it, and task A reading
fs_info->extent_map_shrinker_last_root before task B updates it, making
task A see an odd state that isn't necessarily harmful but may make it
skip certain inode ranges or do more work than necessary by going over
the same inodes again. These unprotected accesses would also trigger
warnings from tools like KCSAN.

So add a lock to protect access to these progress fields.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Filipe Manana 2024-07-08 15:42:45 +01:00 committed by David Sterba
parent b3ebb9b7e9
commit 4484940514
4 changed files with 76 additions and 29 deletions

View File

@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
spin_lock_init(&fs_info->extent_map_shrinker_lock);
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;

View File

@ -1028,7 +1028,14 @@ out_free_pre:
return ret;
}
static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
/*
 * State for a single run of the extent map shrinker.
 *
 * btrfs_free_extent_maps() fills this in at the start of a run, loading
 * last_ino and last_root from fs_info while holding
 * fs_info->extent_map_shrinker_lock, passes it down to btrfs_scan_root() and
 * btrfs_scan_inode(), and stores last_ino and last_root back into fs_info
 * under the same lock when the run finishes.
 */
struct btrfs_em_shrink_ctx {
/* Scan budget for this run; scanning stops once scanned >= nr_to_scan. */
long nr_to_scan;
/* Number of extent maps visited so far during this run. */
long scanned;
/*
 * Number of the last inode processed; the scan of a root starts looking
 * for inodes at last_ino + 1. Set to 0 when a root has no more inodes.
 */
u64 last_ino;
/* ID of the root from which the scan starts or continues. */
u64 last_root;
};
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
@ -1075,7 +1082,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
em = rb_entry(node, struct extent_map, rb_node);
node = rb_next(node);
(*scanned)++;
ctx->scanned++;
if (em->flags & EXTENT_FLAG_PINNED)
goto next;
@ -1096,7 +1103,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
free_extent_map(em);
nr_dropped++;
next:
if (*scanned >= nr_to_scan)
if (ctx->scanned >= ctx->nr_to_scan)
break;
/*
@ -1115,22 +1122,21 @@ next:
return nr_dropped;
}
static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
u64 min_ino = ctx->last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
ctx->last_ino = btrfs_ino(inode);
btrfs_add_delayed_iput(inode);
if (*scanned >= nr_to_scan)
if (ctx->scanned >= ctx->nr_to_scan)
break;
/*
@ -1151,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
ctx->last_root = btrfs_root_id(root);
} else {
/*
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode.
*/
fs_info->extent_map_shrinker_last_ino = 0;
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
ctx->last_ino = 0;
ctx->last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
@ -1166,23 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
u64 next_root_id = start_root_id;
struct btrfs_em_shrink_ctx ctx;
u64 start_root_id;
u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
long scanned = 0;
ctx.scanned = 0;
ctx.nr_to_scan = nr_to_scan;
/*
* In case we have multiple tasks running this shrinker, make the next
* one start from the next inode in case it starts before we finish.
*/
spin_lock(&fs_info->extent_map_shrinker_lock);
ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
fs_info->extent_map_shrinker_last_ino++;
ctx.last_root = fs_info->extent_map_shrinker_last_root;
spin_unlock(&fs_info->extent_map_shrinker_lock);
start_root_id = ctx.last_root;
next_root_id = ctx.last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
nr, ctx.last_root,
ctx.last_ino);
}
/*
* We may be called from memory allocation paths, so we don't want to
* take too much time and slow down tasks, so stop if we need to reschedule.
*/
while (scanned < nr_to_scan && !need_resched()) {
while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
struct btrfs_root *root;
unsigned long count;
@ -1194,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
fs_info->extent_map_shrinker_last_root = 0;
fs_info->extent_map_shrinker_last_ino = 0;
ctx.last_root = 0;
ctx.last_ino = 0;
cycled = true;
continue;
}
@ -1209,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
continue;
if (is_fstree(btrfs_root_id(root)))
nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
}
/*
* In case of multiple tasks running this extent map shrinking code this
* isn't perfect but it's simple and silences things like KCSAN. It's
* not possible to know which task made more progress because we can
* cycle back to the first root and first inode if it's not the first
* time the shrinker ran, see the above logic. Also a task that started
* later may finish earlier than another task and have made less progress. So
* make this simple and update to the progress of the last task that
* finished, with the occasional possibility of having two consecutive
* runs of the shrinker process the same inodes.
*/
spin_lock(&fs_info->extent_map_shrinker_lock);
fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
fs_info->extent_map_shrinker_last_root = ctx.last_root;
spin_unlock(&fs_info->extent_map_shrinker_lock);
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
nr, ctx.last_root,
ctx.last_ino);
}
return nr_dropped;

View File

@ -630,6 +630,7 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
spinlock_t extent_map_shrinker_lock;
u64 extent_map_shrinker_last_root;
u64 extent_map_shrinker_last_ino;

View File

@ -2556,9 +2556,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count,
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr,
u64 last_root_id, u64 last_ino),
TP_ARGS(fs_info, nr_to_scan, nr),
TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino),
TP_STRUCT__entry_btrfs(
__field( long, nr_to_scan )
@ -2570,8 +2571,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TP_fast_assign_btrfs(fs_info,
__entry->nr_to_scan = nr_to_scan;
__entry->nr = nr;
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
__entry->last_root_id = last_root_id;
__entry->last_ino = last_ino;
),
TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
@ -2581,9 +2582,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr,
u64 last_root_id, u64 last_ino),
TP_ARGS(fs_info, nr_dropped, nr),
TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino),
TP_STRUCT__entry_btrfs(
__field( long, nr_dropped )
@ -2595,8 +2597,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
TP_fast_assign_btrfs(fs_info,
__entry->nr_dropped = nr_dropped;
__entry->nr = nr;
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
__entry->last_root_id = last_root_id;
__entry->last_ino = last_ino;
),
TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",