From c965d6402f24adcd3d4d0dd9b1f30a0578b6255c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn <johannes.thumshirn@wdc.com> Date: Tue, 28 Jul 2020 20:25:41 +0900 Subject: [PATCH 1/6] btrfs: handle errors from async submission Btrfs' async submit mechanism is able to handle errors in the submission path and the meta-data async submit function correctly passes the error code to the caller. In btrfs_submit_bio_start() and btrfs_submit_bio_start_direct_io() we're not handling the errors returned by btrfs_csum_one_bio() correctly though and simply call BUG_ON(). This is unnecessary as the caller of these two functions - run_one_async_start - correctly checks for the return values and sets the status of the async_submit_bio. The actual bio submission will be handled later on by run_one_async_done only if async_submit_bio::status is 0, so the data won't be written if we encountered an error in the checksum process. Simply return the error from btrfs_csum_one_bio() to the async submitters, like it's done in btree_submit_bio_start(). Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/inode.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16ce82039dcf..4ad0323b3684 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2160,11 +2160,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { struct inode *inode = private_data; - blk_status_t ret = 0; - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); - BUG_ON(ret); /* -ENOMEM */ - return 0; + return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } /* @@ -7618,10 +7615,8 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, u64 offset) { struct inode *inode = private_data; - blk_status_t ret; - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); - BUG_ON(ret); /* -ENOMEM */ - return 0; + + return btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); } static void btrfs_end_dio_bio(struct bio *bio) From 282dd7d7718444679b046b769d872b188818ca35 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza <mpdesouza@suse.com> Date: Mon, 3 Aug 2020 16:55:01 -0300 Subject: [PATCH 2/6] btrfs: reset compression level for lzo on remount Currently a user can set mount "-o compress" which will set the compression algorithm to zlib, and use the default compress level for zlib (3): relatime,compress=zlib:3,space_cache If the user remounts the fs using "-o compress=lzo", then the old compress_level is used: relatime,compress=lzo:3,space_cache But lzo does not expose any tunable compression level. The same happens if we set any compress argument with different level, also with zstd. Fix this by resetting the compress_level when compress=lzo is specified. With the fix applied, lzo is shown without compress level: relatime,compress=lzo,space_cache CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e529ddb35b87..25967ecaaf0a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -625,6 +625,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } else if (strncmp(args[0].from, "lzo", 3) == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; + info->compress_level = 0; btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); From 604997b4a3803f33e70799b2696bd3dbfed7276c Mon Sep 17 00:00:00 2001 From: David Sterba <dsterba@suse.com> Date: Mon, 27 Jul 2020 17:38:19 +0200 Subject: [PATCH 3/6] btrfs: use the correct const function attribute for btrfs_get_num_csums The build robot reports compiler: h8300-linux-gcc (GCC) 9.3.0 In file included from fs/btrfs/tests/extent-map-tests.c:8: >> fs/btrfs/tests/../ctree.h:2166:8: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] 2166 | size_t __const btrfs_get_num_csums(void); | ^~~~~~~ The function attribute for const does not follow the expected scheme and in this case is confused with a const type qualifier. Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/ctree.c | 2 +- fs/btrfs/ctree.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 70e49d8d4f6c..cd1cd673bc0b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -68,7 +68,7 @@ const char *btrfs_super_csum_driver(u16 csum_type) btrfs_csums[csum_type].name; } -size_t __const btrfs_get_num_csums(void) +size_t __attribute_const__ btrfs_get_num_csums(void) { return ARRAY_SIZE(btrfs_csums); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9c7e466f27a9..729b5b80014a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2262,7 +2262,7 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); -size_t __const btrfs_get_num_csums(void); +size_t __attribute_const__ btrfs_get_num_csums(void); /* From bbc37d6e475eee8ffa2156ec813efc6bbb43c06d Mon Sep 17 00:00:00 2001 From: Filipe Manana <fdmanana@suse.com> Date: Fri, 14 Aug 2020 11:04:09 +0100 Subject: [PATCH 4/6] btrfs: fix space cache memory leak after transaction abort If a transaction aborts it can cause a memory leak of the pages array of a block group's io_ctl structure. The following steps explain how that can happen: 1) Transaction N is committing, currently in state TRANS_STATE_UNBLOCKED and it's about to start writing out dirty extent buffers; 2) Transaction N + 1 already started and another task, task A, just called btrfs_commit_transaction() on it; 3) Block group B was dirtied (extents allocated from it) by transaction N + 1, so when task A calls btrfs_start_dirty_block_groups(), at the very beginning of the transaction commit, it starts writeback for the block group's space cache by calling btrfs_write_out_cache(), which allocates the pages array for the block group's io_ctl with a call to io_ctl_init(). Block group A is added to the io_list of transaction N + 1 by btrfs_start_dirty_block_groups(); 4) While transaction N's commit is writing out the extent buffers, it gets an IO error and aborts transaction N, also setting the file system to RO mode; 5) Task A has already returned from btrfs_start_dirty_block_groups(), is at btrfs_commit_transaction() and has set transaction N + 1 state to TRANS_STATE_COMMIT_START. Immediately after that it checks that the filesystem was turned to RO mode, due to transaction N's abort, and jumps to the "cleanup_transaction" label. After that we end up at btrfs_cleanup_one_transaction() which calls btrfs_cleanup_dirty_bgs(). That helper finds block group B in the transaction's io_list but it never releases the pages array of the block group's io_ctl, resulting in a memory leak. In fact at the point when we are at btrfs_cleanup_dirty_bgs(), the pages array points to pages that were already released by us at __btrfs_write_out_cache() through the call to io_ctl_drop_pages(). We end up freeing the pages array only after waiting for the ordered extent to complete through btrfs_wait_cache_io(), which calls io_ctl_free() to do that. But in the transaction abort case we don't wait for the space cache's ordered extent to complete through a call to btrfs_wait_cache_io(), so that's why we end up with a memory leak - we wait for the ordered extent to complete indirectly by shutting down the work queues and waiting for any jobs in them to complete before returning from close_ctree(). We can solve the leak simply by freeing the pages array right after releasing the pages (with the call to io_ctl_drop_pages()) at __btrfs_write_out_cache(), since we will never use it anymore after that and the pages array points to already released pages at that point, which is currently not a problem since no one will use it after that, but not a good practice anyway since it can easily lead to use-after-free issues. So fix this by freeing the pages array right after releasing the pages at __btrfs_write_out_cache(). This issue can often be reproduced with test case generic/475 from fstests and kmemleak can detect it and reports it with the following trace: unreferenced object 0xffff9bbf009fa600 (size 512): comm "fsstress", pid 38807, jiffies 4298504428 (age 22.028s) hex dump (first 32 bytes): 00 a0 7c 4d 3d ed ff ff 40 a0 7c 4d 3d ed ff ff ..|M=...@.|M=... 80 a0 7c 4d 3d ed ff ff c0 a0 7c 4d 3d ed ff ff ..|M=.....|M=... backtrace: [<00000000f4b5cfe2>] __kmalloc+0x1a8/0x3e0 [<0000000028665e7f>] io_ctl_init+0xa7/0x120 [btrfs] [<00000000a1f95b2d>] __btrfs_write_out_cache+0x86/0x4a0 [btrfs] [<00000000207ea1b0>] btrfs_write_out_cache+0x7f/0xf0 [btrfs] [<00000000af21f534>] btrfs_start_dirty_block_groups+0x27b/0x580 [btrfs] [<00000000c3c23d44>] btrfs_commit_transaction+0xa6f/0xe70 [btrfs] [<000000009588930c>] create_subvol+0x581/0x9a0 [btrfs] [<000000009ef2fd7f>] btrfs_mksubvol+0x3fb/0x4a0 [btrfs] [<00000000474e5187>] __btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs] [<00000000708ee349>] btrfs_ioctl_snap_create_v2+0xb0/0xf0 [btrfs] [<00000000ea60106f>] btrfs_ioctl+0x12c/0x3130 [btrfs] [<000000005c923d6d>] __x64_sys_ioctl+0x83/0xb0 [<0000000043ace2c9>] do_syscall_64+0x33/0x80 [<00000000904efbce>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/disk-io.c | 1 + fs/btrfs/free-space-cache.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c850d7f44fbe..465bc8372e09 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4574,6 +4574,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) cache->io_ctl.inode = NULL; iput(inode); } + ASSERT(cache->io_ctl.pages == NULL); btrfs_put_block_group(cache); } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ef0fd7afb0b1..dc82fd0c80cb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1186,7 +1186,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root, ret = update_cache_item(trans, root, inode, path, offset, io_ctl->entries, io_ctl->bitmaps); out: - io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; @@ -1347,6 +1346,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * them out later */ io_ctl_drop_pages(io_ctl); + io_ctl_free(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); From fb2fecbad50964b9f27a3b182e74e437b40753ef Mon Sep 17 00:00:00 2001 From: Josef Bacik <josef@toxicpanda.com> Date: Mon, 10 Aug 2020 17:31:16 -0400 Subject: [PATCH 5/6] btrfs: check the right error variable in btrfs_del_dir_entries_in_log With my new locking code dbench is so much faster that I tripped over a transaction abort from ENOSPC. This turned out to be because btrfs_del_dir_entries_in_log was checking for ret == -ENOSPC, but this function sets err on error, and returns err. So instead of properly marking the inode as needing a full commit, we were returning -ENOSPC and aborting in __btrfs_unlink_inode. Fix this by checking the proper variable so that we return the correct thing in the case of ENOSPC. The ENOENT needs to be checked, because btrfs_lookup_dir_item_index() can return -ENOENT if the dir item isn't in the tree log (which would happen if we hadn't fsync'ed this guy). We actually handle that case in __btrfs_unlink_inode, so it's an expected error to get back. Fixes: 4a500fd178c8 ("Btrfs: Metadata ENOSPC handling for tree log") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: David Sterba <dsterba@suse.com> [ add note and comment about ENOENT ] Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/tree-log.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 696dd861cc3c..39da9db35278 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3449,11 +3449,13 @@ fail: btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (ret == -ENOSPC) { + if (err == -ENOSPC) { btrfs_set_log_full_commit(trans); - ret = 0; - } else if (ret < 0) - btrfs_abort_transaction(trans, ret); + err = 0; + } else if (err < 0 && err != -ENOENT) { + /* ENOENT can be returned if the entry hasn't been fsynced yet */ + btrfs_abort_transaction(trans, err); + } btrfs_end_log_trans(root); From a84d5d429f9eb56f81b388609841ed993f0ddfca Mon Sep 17 00:00:00 2001 From: Boris Burkov <boris@bur.io> Date: Tue, 18 Aug 2020 11:00:05 -0700 Subject: [PATCH 6/6] btrfs: detect nocow for swap after snapshot delete can_nocow_extent and btrfs_cross_ref_exist both rely on a heuristic for detecting a must cow condition which is not exactly accurate, but saves unnecessary tree traversal. The incorrect assumption is that if the extent was created in a generation smaller than the last snapshot generation, it must be referenced by that snapshot. That is true, except the snapshot could have since been deleted, without affecting the last snapshot generation. The original patch claimed a performance win from this check, but it also leads to a bug where you are unable to use a swapfile if you ever snapshotted the subvolume it's in. Make the check slower and more strict for the swapon case, without modifying the general cow checks as a compromise. Turning swap on does not seem to be a particularly performance sensitive operation, so incurring a possibly unnecessary btrfs_search_slot seems worthwhile for the added usability. Note: Until the snapshot is competely cleaned after deletion, check_committed_refs will still cause the logic to think that cow is necessary, so the user must until 'btrfs subvolu sync' finished before activating the swapfile swapon. CC: stable@vger.kernel.org # 5.4+ Suggested-by: Omar Sandoval <osandov@osandov.com> Signed-off-by: Boris Burkov <boris@bur.io> Signed-off-by: David Sterba <dsterba@suse.com> --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/extent-tree.c | 17 +++++++++++------ fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 18 +++++++++++------- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 729b5b80014a..9a72896bed2e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2518,7 +2518,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr); + u64 objectid, u64 offset, u64 bytenr, bool strict); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2934,7 +2934,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes); + u64 *ram_bytes, bool strict); void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index de6fe176fdfb..5871ef78edba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2306,7 +2306,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root, static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_path *path, - u64 objectid, u64 offset, u64 bytenr) + u64 objectid, u64 offset, u64 bytenr, + bool strict) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = fs_info->extent_root; @@ -2348,9 +2349,13 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) goto out; - /* If extent created before last snapshot => it's definitely shared */ - if (btrfs_extent_generation(leaf, ei) <= - btrfs_root_last_snapshot(&root->root_item)) + /* + * If extent created before last snapshot => it's shared unless the + * snapshot has been deleted. Use the heuristic if strict is false. + */ + if (!strict && + (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; iref = (struct btrfs_extent_inline_ref *)(ei + 1); @@ -2375,7 +2380,7 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr) + u64 bytenr, bool strict) { struct btrfs_path *path; int ret; @@ -2386,7 +2391,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, do { ret = check_committed_ref(root, path, objectid, - offset, bytenr); + offset, bytenr, strict); if (ret && ret != -ENOENT) goto out; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 841c516079a9..3abaff4d2cff 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1571,7 +1571,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, NULL, NULL); + NULL, NULL, NULL, false); if (ret <= 0) { ret = 0; if (!nowait) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4ad0323b3684..46e04247193a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1609,7 +1609,7 @@ next_slot: goto out_check; ret = btrfs_cross_ref_exist(root, ino, found_key.offset - - extent_offset, disk_bytenr); + extent_offset, disk_bytenr, false); if (ret) { /* * ret could be -EIO if the above fails to read @@ -6949,6 +6949,8 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent + * @strict: if true, omit optimizations that might force us into unnecessary + * cow. e.g., don't trust generation number. * * This function will flush ordered extents in the range to ensure proper * nocow checks for (nowait == false) case. @@ -6963,7 +6965,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes) + u64 *ram_bytes, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_path *path; @@ -7041,8 +7043,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, * Do the same check as in btrfs_cross_ref_exist but without the * unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item)) + if (!strict && + (btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item))) goto out; backref_offset = btrfs_file_extent_offset(leaf, fi); @@ -7078,7 +7081,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, */ ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), - key.offset - backref_offset, disk_bytenr); + key.offset - backref_offset, disk_bytenr, + strict); if (ret) { ret = 0; goto out; @@ -7299,7 +7303,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1 && + &orig_block_len, &ram_bytes, false) == 1 && btrfs_inc_nocow_writers(fs_info, block_start)) { struct extent_map *em2; @@ -10130,7 +10134,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, free_extent_map(em); em = NULL; - ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL); + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true); if (ret < 0) { goto out; } else if (ret) {