From 7561551e7ba870b9659083b95feb520fb2dacce3 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 13 Apr 2023 13:57:17 +0800
Subject: [PATCH 1/3] btrfs: scrub: try harder to mark RAID56 block groups
 read-only

Currently we allow a block group not to be marked read-only for scrub.

But for RAID56 block groups, if we require the block group to be
read-only, then we're allowed to use cached content from the scrub
stripe to reduce unnecessary RAID56 reads.

So this patch will:

- Make btrfs_inc_block_group_ro() try harder

  During my tests, for cases like btrfs/061 and btrfs/064, we can hit
  ENOSPC from btrfs_inc_block_group_ro() calls during scrub.

  The reason is that if we only have a single data chunk and try to
  scrub it, we won't have any space left for newer data writes.

  But this check should be done by the caller, especially since for
  scrub we only temporarily mark the chunk read-only.  And newer data
  writes would always try to allocate a new data chunk when needed.

- Return an error from scrub if we fail to mark a RAID56 chunk
  read-only

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/block-group.c | 14 ++++++++++++--
 fs/btrfs/scrub.c       |  9 ++++++++-
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 957ad1c31c4f..590b03560265 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2818,10 +2818,20 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 	}
 
 	ret = inc_block_group_ro(cache, 0);
-	if (!do_chunk_alloc || ret == -ETXTBSY)
-		goto unlock_out;
 	if (!ret)
 		goto out;
+	if (ret == -ETXTBSY)
+		goto unlock_out;
+
+	/*
+	 * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
+	 * chunk allocation storms exhausting the system chunk array. Otherwise
+	 * we still want to try our best to mark the block group read-only.
+	 */
+	if (!do_chunk_alloc && ret == -ENOSPC &&
+	    (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
+		goto unlock_out;
+
 	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
 	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 	if (ret < 0)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 836725a19661..dd37cba58022 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2518,13 +2518,20 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 
 		if (ret == 0) {
 			ro_set = 1;
-		} else if (ret == -ENOSPC && !sctx->is_dev_replace) {
+		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
+			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
 			/*
 			 * btrfs_inc_block_group_ro return -ENOSPC when it
 			 * failed in creating new chunk for metadata.
 			 * It is not a problem for scrub, because
 			 * metadata are always cowed, and our scrub paused
 			 * commit_transactions.
+			 *
+			 * For RAID56 chunks, we have to mark them read-only
+			 * for scrub, as later we would use our own cache
+			 * out of RAID56 realm.
+			 * Thus we want the RAID56 bg to be marked RO to
+			 * prevent RMW from screwing up our cache.
 			 */
 			ro_set = 0;
 		} else if (ret == -ETXTBSY) {
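The retry logic above is easier to see in isolation. Below is a minimal,
self-contained userspace model of the new control flow; mark_bg_ro(),
try_set_ro(), force_chunk_alloc() and BG_SYSTEM are hypothetical stand-ins
for the kernel's btrfs_inc_block_group_ro() path, not its real API, and
only the decision order mirrors the patch.

    /* Hypothetical model of the "try harder" path: bail out early on
     * success or -ETXTBSY, skip the forced allocation only for SYSTEM
     * block groups, otherwise allocate a chunk and retry once. */
    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define BG_SYSTEM (1U << 0)  /* stand-in for BTRFS_BLOCK_GROUP_SYSTEM */

    static int try_set_ro(unsigned int flags)
    {
        (void)flags;
        return -ENOSPC;          /* stub: pretend marking RO hits ENOSPC */
    }

    static int force_chunk_alloc(void)
    {
        return 0;                /* stub: pretend chunk allocation works */
    }

    static int mark_bg_ro(unsigned int flags, bool do_chunk_alloc)
    {
        int ret = try_set_ro(flags);

        if (!ret || ret == -ETXTBSY)
            return ret;
        /* Never force-allocate SYSTEM chunks; an allocation storm could
         * exhaust the system chunk array. */
        if (!do_chunk_alloc && ret == -ENOSPC && (flags & BG_SYSTEM))
            return ret;
        ret = force_chunk_alloc();
        if (ret < 0)
            return ret;
        return try_set_ro(flags);  /* retry after the forced allocation */
    }

    int main(void)
    {
        printf("system bg: %d\n", mark_bg_ro(BG_SYSTEM, false));
        printf("data bg:   %d\n", mark_bg_ro(0, false));
        return 0;
    }

The key behavioral change is that a forced chunk allocation is now
attempted even when the caller did not ask for one, with SYSTEM block
groups as the sole exception.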
From 806570c0bb7b4847828c22c4934fcf2dc8fc572f Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Thu, 4 May 2023 13:58:13 +0200
Subject: [PATCH 2/3] btrfs: handle memory allocation failure in
 btrfs_csum_one_bio

Since f8a53bb58ec7 ("btrfs: handle checksum generation in the storage
layer") the failures of btrfs_csum_one_bio() are handled via
bio_end_io().

This means we can return BLK_STS_RESOURCE from btrfs_csum_one_bio() in
case the allocation of the ordered sums fails.

This also fixes a syzkaller report where injecting a failure into the
kvzalloc() call results in a BUG_ON().

Reported-by: syzbot+d8941552e21eac774778@syzkaller.appspotmail.com
Reviewed-by: Christoph Hellwig
Reviewed-by: Anand Jain
Signed-off-by: Johannes Thumshirn
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/file-item.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index cd4cce9ba443..d1cd0a692f8e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -792,7 +792,9 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
 		nofs_flag = memalloc_nofs_save();
 		sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bytes_left),
 				GFP_KERNEL);
 		memalloc_nofs_restore(nofs_flag);
-		BUG_ON(!sums); /* -ENOMEM */
+		if (!sums)
+			return BLK_STS_RESOURCE;
+
 		sums->len = bytes_left;
 		ordered = btrfs_lookup_ordered_extent(inode, offset);
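The shape of this fix, propagating an allocation failure instead of
crashing, can be sketched in plain C. Note that csum_one_bio_model() and
the status values below are hypothetical stand-ins; in the kernel the
returned blk_status_t is surfaced through bio_end_io() rather than to a
direct caller.

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative status codes; the kernel's blk_status_t values differ. */
    enum sts { STS_OK = 0, STS_RESOURCE = 1 };

    static enum sts csum_one_bio_model(size_t bytes_left)
    {
        /* stand-in for kvzalloc(btrfs_ordered_sum_size(...), GFP_KERNEL) */
        unsigned char *sums = calloc(1, bytes_left);

        if (!sums)
            return STS_RESOURCE;  /* was: BUG_ON(!sums), i.e. a crash */
        /* ... compute and attach checksums here ... */
        free(sums);
        return STS_OK;
    }

    int main(void)
    {
        printf("status = %d\n", csum_one_bio_model(4096));
        return 0;
    }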
From 597441b3436a43011f31ce71dc0a6c0bf5ce958a Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 11 May 2023 12:45:59 -0400
Subject: [PATCH 3/3] btrfs: use nofs when cleaning up aborted transactions

Our CI system caught a lockdep splat:

  ======================================================
  WARNING: possible circular locking dependency detected
  6.3.0-rc7+ #1167 Not tainted
  ------------------------------------------------------
  kswapd0/46 is trying to acquire lock:
  ffff8c6543abd650 (sb_internal#2){++++}-{0:0}, at: btrfs_commit_inode_delayed_inode+0x5f/0x120

  but task is already holding lock:
  ffffffffabe61b40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x4aa/0x7a0

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #1 (fs_reclaim){+.+.}-{0:0}:
         fs_reclaim_acquire+0xa5/0xe0
         kmem_cache_alloc+0x31/0x2c0
         alloc_extent_state+0x1d/0xd0
         __clear_extent_bit+0x2e0/0x4f0
         try_release_extent_mapping+0x216/0x280
         btrfs_release_folio+0x2e/0x90
         invalidate_inode_pages2_range+0x397/0x470
         btrfs_cleanup_dirty_bgs+0x9e/0x210
         btrfs_cleanup_one_transaction+0x22/0x760
         btrfs_commit_transaction+0x3b7/0x13a0
         create_subvol+0x59b/0x970
         btrfs_mksubvol+0x435/0x4f0
         __btrfs_ioctl_snap_create+0x11e/0x1b0
         btrfs_ioctl_snap_create_v2+0xbf/0x140
         btrfs_ioctl+0xa45/0x28f0
         __x64_sys_ioctl+0x88/0xc0
         do_syscall_64+0x38/0x90
         entry_SYSCALL_64_after_hwframe+0x72/0xdc

  -> #0 (sb_internal#2){++++}-{0:0}:
         __lock_acquire+0x1435/0x21a0
         lock_acquire+0xc2/0x2b0
         start_transaction+0x401/0x730
         btrfs_commit_inode_delayed_inode+0x5f/0x120
         btrfs_evict_inode+0x292/0x3d0
         evict+0xcc/0x1d0
         inode_lru_isolate+0x14d/0x1e0
         __list_lru_walk_one+0xbe/0x1c0
         list_lru_walk_one+0x58/0x80
         prune_icache_sb+0x39/0x60
         super_cache_scan+0x161/0x1f0
         do_shrink_slab+0x163/0x340
         shrink_slab+0x1d3/0x290
         shrink_node+0x300/0x720
         balance_pgdat+0x35c/0x7a0
         kswapd+0x205/0x410
         kthread+0xf0/0x120
         ret_from_fork+0x29/0x50

  other info that might help us debug this:

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(fs_reclaim);
                                 lock(sb_internal#2);
                                 lock(fs_reclaim);
    lock(sb_internal#2);

   *** DEADLOCK ***

  3 locks held by kswapd0/46:
   #0: ffffffffabe61b40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x4aa/0x7a0
   #1: ffffffffabe50270 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x113/0x290
   #2: ffff8c6543abd0e0 (&type->s_umount_key#44){++++}-{3:3}, at: super_cache_scan+0x38/0x1f0

  stack backtrace:
  CPU: 0 PID: 46 Comm: kswapd0 Not tainted 6.3.0-rc7+ #1167
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
  Call Trace:
   dump_stack_lvl+0x58/0x90
   check_noncircular+0xd6/0x100
   ? save_trace+0x3f/0x310
   ? add_lock_to_list+0x97/0x120
   __lock_acquire+0x1435/0x21a0
   lock_acquire+0xc2/0x2b0
   ? btrfs_commit_inode_delayed_inode+0x5f/0x120
   start_transaction+0x401/0x730
   ? btrfs_commit_inode_delayed_inode+0x5f/0x120
   btrfs_commit_inode_delayed_inode+0x5f/0x120
   btrfs_evict_inode+0x292/0x3d0
   ? lock_release+0x134/0x270
   ? __pfx_wake_bit_function+0x10/0x10
   evict+0xcc/0x1d0
   inode_lru_isolate+0x14d/0x1e0
   __list_lru_walk_one+0xbe/0x1c0
   ? __pfx_inode_lru_isolate+0x10/0x10
   ? __pfx_inode_lru_isolate+0x10/0x10
   list_lru_walk_one+0x58/0x80
   prune_icache_sb+0x39/0x60
   super_cache_scan+0x161/0x1f0
   do_shrink_slab+0x163/0x340
   shrink_slab+0x1d3/0x290
   shrink_node+0x300/0x720
   balance_pgdat+0x35c/0x7a0
   kswapd+0x205/0x410
   ? __pfx_autoremove_wake_function+0x10/0x10
   ? __pfx_kswapd+0x10/0x10
   kthread+0xf0/0x120
   ? __pfx_kthread+0x10/0x10
   ret_from_fork+0x29/0x50

This happens because when we abort the transaction in the transaction
commit path we call invalidate_inode_pages2_range on our block group
cache inodes (if we have space cache v1) and any delalloc inodes we may
have.  The plain invalidate_inode_pages2_range() call passes through
GFP_KERNEL, which makes sense in most cases, but not here.  Wrap these
two invalidate callees with memalloc_nofs_save/memalloc_nofs_restore to
make sure we don't end up with the fs reclaim dependency under the
transaction dependency.

CC: stable@vger.kernel.org # 4.14+
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fbf9006c6234..6de6dcf2743e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4936,7 +4936,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 		 */
 		inode = igrab(&btrfs_inode->vfs_inode);
 		if (inode) {
+			unsigned int nofs_flag;
+
+			nofs_flag = memalloc_nofs_save();
 			invalidate_inode_pages2(inode->i_mapping);
+			memalloc_nofs_restore(nofs_flag);
 			iput(inode);
 		}
 		spin_lock(&root->delalloc_lock);
@@ -5042,7 +5046,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
 
 	inode = cache->io_ctl.inode;
 	if (inode) {
+		unsigned int nofs_flag;
+
+		nofs_flag = memalloc_nofs_save();
 		invalidate_inode_pages2(inode->i_mapping);
+		memalloc_nofs_restore(nofs_flag);
+
 		BTRFS_I(inode)->generation = 0;
 		cache->io_ctl.inode = NULL;
 		iput(inode);
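For readers unfamiliar with the scoped-NOFS API, here is a minimal
userspace model of the save/restore bracket the patch adds. The helpers
below only mimic the behavior of the kernel's memalloc_nofs_save() and
memalloc_nofs_restore(), which toggle PF_MEMALLOC_NOFS in current->flags
so that every allocation inside the bracket implicitly behaves as
GFP_NOFS; task_flags and invalidate_pages_model() are stand-ins for
illustration.

    #include <stdio.h>

    #define PF_MEMALLOC_NOFS 0x1u

    static unsigned int task_flags;   /* stand-in for current->flags */

    static unsigned int memalloc_nofs_save(void)
    {
        unsigned int old = task_flags & PF_MEMALLOC_NOFS;

        task_flags |= PF_MEMALLOC_NOFS;
        return old;
    }

    static void memalloc_nofs_restore(unsigned int old)
    {
        /* Restore only the NOFS bit rather than clearing it outright. */
        task_flags = (task_flags & ~PF_MEMALLOC_NOFS) | old;
    }

    static void invalidate_pages_model(void)
    {
        /* Any allocation here would implicitly drop __GFP_FS, so it can
         * never recurse into filesystem reclaim. */
        printf("invalidating, task_flags=0x%x\n", task_flags);
    }

    int main(void)
    {
        unsigned int nofs_flag = memalloc_nofs_save();

        invalidate_pages_model();
        memalloc_nofs_restore(nofs_flag);
        return 0;
    }

Restoring the saved bit instead of clearing it unconditionally is what
makes nesting of these brackets safe.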