From 8c4cba2adbb0ec63f3833cad7452a431580e9ffa Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Mon, 8 Jul 2024 13:24:08 +0200
Subject: [PATCH 001/110] btrfs: update stripe extents for existing logical addresses

Update a stripe extent in case of an already existing logical address
but different physical addresses and/or device id, instead of bailing
out with EEXIST. This can happen, for example, during a device replace
operation, where data extents get rewritten to a new disk.

Reviewed-by: Qu Wenruo
Signed-off-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/raid-stripe-tree.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index e6f7a234b8f6..0c7b928805e5 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -73,6 +73,36 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
 	return ret;
 }
 
+static int update_raid_extent_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_key *key,
+				   struct btrfs_stripe_extent *stripe_extent,
+				   const size_t item_size)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int ret;
+	int slot;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
+				0, 1);
+	if (ret)
+		return (ret == 1 ? ret : -EINVAL);
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+
+	write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
+			    item_size);
+	btrfs_mark_buffer_dirty(trans, leaf);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
 					struct btrfs_io_context *bioc)
 {
@@ -112,6 +142,9 @@ static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
 				item_size);
+	if (ret == -EEXIST)
+		ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
+					      item_size);
 	if (ret)
 		btrfs_abort_transaction(trans, ret);
 
From 7fa5230b46f2c333f090716c52e99a0fbbee5fbd Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Tue, 9 Jul 2024 09:40:34 +0200
Subject: [PATCH 002/110] btrfs: update stripe_extent delete loop assumptions

btrfs_delete_raid_extent() was written under the assumption that its
call chain always passes a start/length tuple that matches a single
extent. However, btrfs_delete_raid_extent() is called by
do_free_extent_accounting(), which in turn is called by
__btrfs_free_extent(), and this call chain passes in a start address
and a length that can possibly match multiple on-disk extents.

To make this possible, we have to adjust the start and length of each
btree node lookup, to not delete beyond the requested range.
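For illustration, the resulting deletion loop has roughly the following
shape (a simplified sketch of the hunk below; the btree search, the item
deletion and the error handling are elided):

	while (1) {
		/*
		 * Look up and delete the stripe extent item at 'start';
		 * 'key.offset' then holds the length of the deleted item.
		 */
		...
		start += key.offset;
		length -= key.offset;
		if (length == 0)
			break;
		btrfs_release_path(path);
	}

This way a single call can consume several on-disk extents while never
deleting past the end of the requested range.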
Reviewed-by: Qu Wenruo
Signed-off-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/raid-stripe-tree.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 0c7b928805e5..bd06ff795691 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -66,6 +66,11 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
 		if (ret)
 			break;
 
+		start += key.offset;
+		length -= key.offset;
+		if (length == 0)
+			break;
+
 		btrfs_release_path(path);
 	}
 
From ab094670fab468aef551aafc197cdf4cf1a2e611 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 23 Jul 2024 15:21:22 +0100
Subject: [PATCH 003/110] btrfs: reduce size and overhead of extent_map_block_end()

At extent_map_block_end() we are calling the inline functions
extent_map_block_start() and extent_map_block_len() multiple times,
which results in expanding their code multiple times, increasing the
compiled code size and repeating the computations those functions do.
Improve this by caching their results in local variables.

The size of the module before this change:

  $ size fs/btrfs/btrfs.ko
     text    data     bss     dec     hex filename
  1755770  163800   16920 1936490  1d8c6a fs/btrfs/btrfs.ko

And after this change:

  $ size fs/btrfs/btrfs.ko
     text    data     bss     dec     hex filename
  1755656  163800   16920 1936376  1d8bf8 fs/btrfs/btrfs.ko

Reviewed-by: Josef Bacik
Reviewed-by: Qu Wenruo
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent_map.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 10ac5f657e38..25d191f1ac10 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -192,10 +192,13 @@ static inline u64 extent_map_block_len(const struct extent_map *em)
 
 static inline u64 extent_map_block_end(const struct extent_map *em)
 {
-	if (extent_map_block_start(em) + extent_map_block_len(em) <
-	    extent_map_block_start(em))
+	const u64 block_start = extent_map_block_start(em);
+	const u64 block_end = block_start + extent_map_block_len(em);
+
+	if (block_end < block_start)
 		return (u64)-1;
-	return extent_map_block_start(em) + extent_map_block_len(em);
+
+	return block_end;
 }
 
 static bool can_merge_extent_map(const struct extent_map *em)

From c92bf5df8a120f4ee91832faee373570d83cbb13 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 24 Jul 2024 14:29:02 +0930
Subject: [PATCH 004/110] btrfs: move uuid tree related code to uuid-tree.[ch]

The functions btrfs_uuid_scan_kthread() and btrfs_create_uuid_tree()
implement the UUID tree rescan and creation; they are not volume
related and thus not suitable for volumes.[ch]. Move them to
uuid-tree.[ch] instead.

Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/uuid-tree.c | 179 +++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/uuid-tree.h |   2 +
 fs/btrfs/volumes.c   | 177 ------------------------------------------
 fs/btrfs/volumes.h   |   2 -
 4 files changed, 181 insertions(+), 179 deletions(-)

diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index eae75bb572b9..c6399513c66f 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -3,6 +3,7 @@
  * Copyright (C) STRATO AG 2013. All rights reserved.
*/ +#include #include #include #include "messages.h" @@ -12,6 +13,7 @@ #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" static void btrfs_uuid_to_key(const u8 *uuid, u8 type, struct btrfs_key *key) { @@ -390,3 +392,180 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + bool closing = false; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + while (1) { + if (btrfs_fs_closing(fs_info)) { + closing = true; + break; + } + ret = btrfs_search_forward(root, &key, path, + BTRFS_OLDEST_GENERATION); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + btrfs_release_path(path); + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + btrfs_warn(fs_info, "uuid_tree_add failed %d", + ret); + break; + } + } + +skip: + btrfs_release_path(path); + if (trans) { + ret = btrfs_end_transaction(trans); + trans = NULL; + if (ret) + break; + } + + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans); + if (ret) + btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); + else if (!closing) + set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = 
btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + btrfs_warn(fs_info, "failed to start uuid_scan task"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h index a3f5757cc7cf..c60ad20325cc 100644 --- a/fs/btrfs/uuid-tree.h +++ b/fs/btrfs/uuid-tree.h @@ -13,5 +13,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_uuid_scan_kthread(void *data); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fcedc43ef291..ff7c7194f5f7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4784,183 +4784,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_uuid_scan_kthread(void *data) -{ - struct btrfs_fs_info *fs_info = data; - struct btrfs_root *root = fs_info->tree_root; - struct btrfs_key key; - struct btrfs_path *path = NULL; - int ret = 0; - struct extent_buffer *eb; - int slot; - struct btrfs_root_item root_item; - u32 item_size; - struct btrfs_trans_handle *trans = NULL; - bool closing = false; - - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - key.objectid = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = 0; - - while (1) { - if (btrfs_fs_closing(fs_info)) { - closing = true; - break; - } - ret = btrfs_search_forward(root, &key, path, - BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - break; - } - - if (key.type != BTRFS_ROOT_ITEM_KEY || - (key.objectid < BTRFS_FIRST_FREE_OBJECTID && - key.objectid != BTRFS_FS_TREE_OBJECTID) || - key.objectid > BTRFS_LAST_FREE_OBJECTID) - goto skip; - - eb = path->nodes[0]; - slot = path->slots[0]; - item_size = btrfs_item_size(eb, slot); - if (item_size < sizeof(root_item)) - goto skip; - - read_extent_buffer(eb, &root_item, - btrfs_item_ptr_offset(eb, slot), - (int)sizeof(root_item)); - if (btrfs_root_refs(&root_item) == 0) - goto skip; - - if (!btrfs_is_empty_uuid(root_item.uuid) || - !btrfs_is_empty_uuid(root_item.received_uuid)) { - if (trans) - goto update_tree; - - btrfs_release_path(path); - /* - * 1 - subvol uuid item - * 1 - received_subvol uuid item - */ - trans = btrfs_start_transaction(fs_info->uuid_root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - continue; - } else { - goto skip; - } -update_tree: - btrfs_release_path(path); - if (!btrfs_is_empty_uuid(root_item.uuid)) { - ret = btrfs_uuid_tree_add(trans, root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - - if (!btrfs_is_empty_uuid(root_item.received_uuid)) { - ret = btrfs_uuid_tree_add(trans, - root_item.received_uuid, - BTRFS_UUID_KEY_RECEIVED_SUBVOL, - key.objectid); - if (ret < 0) { - btrfs_warn(fs_info, "uuid_tree_add failed %d", - ret); - break; - } - } - -skip: - 
btrfs_release_path(path); - if (trans) { - ret = btrfs_end_transaction(trans); - trans = NULL; - if (ret) - break; - } - - if (key.offset < (u64)-1) { - key.offset++; - } else if (key.type < BTRFS_ROOT_ITEM_KEY) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - } else if (key.objectid < (u64)-1) { - key.offset = 0; - key.type = BTRFS_ROOT_ITEM_KEY; - key.objectid++; - } else { - break; - } - cond_resched(); - } - -out: - btrfs_free_path(path); - if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); - else if (!closing) - set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); - up(&fs_info->uuid_tree_rescan_sem); - return 0; -} - -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *tree_root = fs_info->tree_root; - struct btrfs_root *uuid_root; - struct task_struct *task; - int ret; - - /* - * 1 - root node - * 1 - root item - */ - trans = btrfs_start_transaction(tree_root, 2); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - fs_info->uuid_root = uuid_root; - - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - - down(&fs_info->uuid_tree_rescan_sem); - task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); - if (IS_ERR(task)) { - /* fs_info->update_uuid_tree_gen remains 0 in all error case */ - btrfs_warn(fs_info, "failed to start uuid_scan task"); - up(&fs_info->uuid_tree_rescan_sem); - return PTR_ERR(task); - } - - return 0; -} - /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 37a09ebb34dd..c947187539dd 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -725,8 +725,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); -int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, From 45714ff75c3618a191a952ce96ec15724bd4fdb3 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 18 Jul 2024 00:58:54 +0800 Subject: [PATCH 005/110] btrfs: print message on device opening error during mount [ENHANCEMENT] When mounting a btrfs filesystem, the filesystem opens the block device, and if this fails, there is no message about it. Print a message about it to help debugging. [TEST] I have a btrfs filesystem on three block devices, one of which is write-protected, so regular mounts fail, but there is no message in dmesg. /dev/vdb normal /dev/vdc write protected /dev/vdd normal Before patch: $ sudo mount /dev/vdb /mnt/ mount: mount(2) failed: no such file or directory $ sudo dmesg # Show only messages about missing block devices .... [ 352.947196] BTRFS error (device vdb): devid 2 uuid 4ee2c625-a3b2-4fe0-b411-756b23e08533 missing .... 
After patch: $ sudo mount /dev/vdb /mnt/ mount: mount(2) failed: no such file or directory $ sudo dmesg # Show bdev_file_open_by_path failed. .... [ 352.944328] BTRFS error: failed to open device for path /dev/vdc with flags 0x3: -13 [ 352.947196] BTRFS error (device vdb): missing devid 2 uuid 4ee2c625-a3b2-4fe0-b411-756b23e08533 .... Signed-off-by: Li Zhang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ff7c7194f5f7..e07452207426 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -476,6 +476,8 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, if (IS_ERR(*bdev_file)) { ret = PTR_ERR(*bdev_file); + btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d", + device_path, flags, ret); goto error; } bdev = file_bdev(*bdev_file); From 03d6612648a48d0f5a60a013d4b583b4886807d2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 15:55:33 -0400 Subject: [PATCH 006/110] btrfs: convert btrfs_readahead() to only use folio We're the only user of readahead_page_batch(). Convert btrfs_readahead() to use the folio based helpers to do readahead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c73cd4f89015..2798a3ca1db4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1176,26 +1176,6 @@ int btrfs_read_folio(struct file *file, struct folio *folio) return ret; } -static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - u64 *prev_em_start) -{ - struct btrfs_inode *inode = page_to_inode(pages[0]); - int index; - - ASSERT(em_cached); - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio_ctrl, - prev_em_start); - put_page(pages[index]); - } -} - /* * helper for __extent_writepage, doing all of the delayed allocation setup. * @@ -2379,18 +2359,18 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; - struct page *pagepool[16]; + struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); + struct folio *folio; + u64 start = readahead_pos(rac); + u64 end = start + readahead_length(rac) - 1; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - int nr; - while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = readahead_pos(rac); - u64 contig_end = contig_start + readahead_batch_length(rac) - 1; + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio_ctrl, &prev_em_start); - } + while ((folio = readahead_folio(rac)) != NULL) + btrfs_do_readpage(&folio->page, &em_cached, &bio_ctrl, + &prev_em_start); if (em_cached) free_extent_map(em_cached); From e19317ed9e7c5b8646713d3d7b53b8312673faa4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 15:59:34 -0400 Subject: [PATCH 007/110] btrfs: convert btrfs_read_folio() to only use a folio Currently we're using the page for everything here. 
Convert this to use the folio helpers instead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2798a3ca1db4..5e9e4671edef 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1155,17 +1155,16 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct btrfs_inode *inode = page_to_inode(page); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; + struct btrfs_inode *inode = folio_to_inode(folio); + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(&folio->page, &em_cached, &bio_ctrl, NULL); free_extent_map(em_cached); /* From 645006d87cd8236855eedc860ef16842c34e1296 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 16:16:20 -0400 Subject: [PATCH 008/110] btrfs: convert end_page_read() to take a folio We have this helper function to set the page range uptodate once we're done reading it, as well as run fsverity against it. Half of these functions already take a folio, just rename this to end_folio_read and then rework it to take a folio instead, and update everything accordingly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5e9e4671edef..feec56a77d9b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -406,30 +406,31 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops); } -static bool btrfs_verify_page(struct page *page, u64 start) +static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) { - if (!fsverity_active(page->mapping->host) || - PageUptodate(page) || - start >= i_size_read(page->mapping->host)) + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + + if (!fsverity_active(folio->mapping->host) || + btrfs_folio_test_uptodate(fs_info, folio, start, len) || + start >= i_size_read(folio->mapping->host)) return true; - return fsverity_verify_page(page); + return fsverity_verify_folio(folio); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 len) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); - if (uptodate && btrfs_verify_page(page, start)) + if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, page->mapping)) - unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) + folio_unlock(folio); else 
		btrfs_subpage_end_reader(fs_info, folio, start, len);
 }
 
@@ -642,7 +643,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
 		}
 
 		/* Update page status and unlock. */
-		end_page_read(folio_page(folio, 0), uptodate, start, len);
+		end_folio_read(folio, uptodate, start, len);
 
 		endio_readpage_release_extent(&processed, BTRFS_I(inode),
 					      start, end, uptodate);
 }
@@ -1048,13 +1049,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 			iosize = PAGE_SIZE - pg_offset;
 			memzero_page(page, pg_offset, iosize);
 			unlock_extent(tree, cur, cur + iosize - 1, NULL);
-			end_page_read(page, true, cur, iosize);
+			end_folio_read(page_folio(page), true, cur, iosize);
 			break;
 		}
 		em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached);
 		if (IS_ERR(em)) {
 			unlock_extent(tree, cur, end, NULL);
-			end_page_read(page, false, cur, end + 1 - cur);
+			end_folio_read(page_folio(page), false, cur, end + 1 - cur);
 			return PTR_ERR(em);
 		}
 		extent_offset = cur - em->start;
@@ -1123,7 +1124,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 
 			memzero_page(page, pg_offset, iosize);
 			unlock_extent(tree, cur, cur + iosize - 1, NULL);
-			end_page_read(page, true, cur, iosize);
+			end_folio_read(page_folio(page), true, cur, iosize);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -1131,7 +1132,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		/* the get_extent function already copied into the page */
 		if (block_start == EXTENT_MAP_INLINE) {
 			unlock_extent(tree, cur, cur + iosize - 1, NULL);
-			end_page_read(page, true, cur, iosize);
+			end_folio_read(page_folio(page), true, cur, iosize);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -2551,7 +2552,7 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli
 			return true;
 		/*
 		 * Even there is no eb refs here, we may still have
-		 * end_page_read() call relying on page::private.
+		 * end_folio_read() call relying on page::private.
 		 */
 		if (atomic_read(&subpage->readers))
 			return true;

From fcf50d161c622f18d8ecc4f0925b452349d9f1f0 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 23 Jul 2024 16:19:12 -0400
Subject: [PATCH 009/110] btrfs: convert begin_page_read() to take a folio instead

This already uses a folio internally, so change it to take a folio as
an argument instead.
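For illustration, the conversion at the call site follows the usual
page_folio() pattern (a sketch based on the hunk below, not an
additional change):

	/* Before: the helper derived the folio from the page internally. */
	begin_page_read(fs_info, page);

	/* After: the caller passes the folio explicitly. */
	begin_folio_read(fs_info, page_folio(page));

A later patch in this series converts btrfs_do_readpage() itself to
take a folio, at which point the page_folio() call at this call site
disappears as well.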
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index feec56a77d9b..5a69fb566fa0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -551,16 +551,14 @@ update: processed->uptodate = uptodate; } -static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) +static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { - struct folio *folio = page_folio(page); - ASSERT(folio_test_locked(folio)); if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio)); - btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE); + btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); } /* @@ -1038,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } } bio_ctrl->end_io_func = end_bbio_data_read; - begin_page_read(fs_info, page); + begin_folio_read(fs_info, page_folio(page)); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; bool force_bio_submit = false; From b35397d1d325f43a360489bb6f740b40668d7005 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 16:32:29 -0400 Subject: [PATCH 010/110] btrfs: convert submit_extent_page() to use a folio The callers of this helper are going to be converted to using a folio, so adjust submit_extent_page to become submit_extent_folio and update it to use all the relevant folio helpers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5a69fb566fa0..6829f44ea083 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -736,12 +736,13 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, u64 disk_bytenr, + struct folio *folio, u64 disk_bytenr, unsigned int pg_offset) { struct bio *bio = &bio_ctrl->bbio->bio; struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -754,7 +755,7 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, /* * The contig check requires the following conditions to be met: * - * 1) The pages are belonging to the same inode + * 1) The folios are belonging to the same inode * This is implied by the call chain. * * 2) The range has adjacent logical bytenr @@ -763,8 +764,8 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, * This is required for the usage of btrfs_bio->file_offset. */ return bio_end_sector(bio) == sector && - page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == - page_offset(page) + pg_offset; + folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == + folio_pos(folio) + pg_offset; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -817,17 +818,17 @@ static void alloc_new_bio(struct btrfs_inode *inode, * The mirror number for this IO should already be initizlied in * @bio_ctrl->mirror_num. 
*/ -static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, - u64 disk_bytenr, struct page *page, +static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, + u64 disk_bytenr, struct folio *folio, size_t size, unsigned long pg_offset) { - struct btrfs_inode *inode = page_to_inode(page); + struct btrfs_inode *inode = folio_to_inode(folio); ASSERT(pg_offset + size <= PAGE_SIZE); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) submit_one_bio(bio_ctrl); do { @@ -836,7 +837,7 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, /* Allocate new bio if needed */ if (!bio_ctrl->bbio) { alloc_new_bio(inode, bio_ctrl, disk_bytenr, - page_offset(page) + pg_offset); + folio_pos(folio) + pg_offset); } /* Cap to the current ordered extent boundary if there is one. */ @@ -846,21 +847,22 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, len = bio_ctrl->len_to_oe_boundary; } - if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { + if (!bio_add_folio(&bio_ctrl->bbio->bio, folio, len, pg_offset)) { /* bio full: move on to a new one */ submit_one_bio(bio_ctrl); continue; } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); + wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + len); size -= len; pg_offset += len; disk_bytenr += len; /* - * len_to_oe_boundary defaults to U32_MAX, which isn't page or + * len_to_oe_boundary defaults to U32_MAX, which isn't folio or * sector aligned. alloc_new_bio() then sets it to the end of * our ordered extent for writes into zoned devices. * @@ -870,15 +872,15 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, * boundary is correct. * * When len_to_oe_boundary is U32_MAX, the cap above would - * result in a 4095 byte IO for the last page right before - * we hit the bio limit of UINT_MAX. bio_add_page() has all + * result in a 4095 byte IO for the last folio right before + * we hit the bio limit of UINT_MAX. bio_add_folio() has all * the checks required to make sure we don't overflow the bio, * and we should just ignore len_to_oe_boundary completely * unless we're using it to track an ordered extent. * * It's pretty hard to make a bio sized U32_MAX, but it can * happen when the page cache is able to feed us contiguous - * pages for large extents. + * folios for large extents. */ if (bio_ctrl->len_to_oe_boundary != U32_MAX) bio_ctrl->len_to_oe_boundary -= len; @@ -1143,8 +1145,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - pg_offset); + submit_extent_folio(bio_ctrl, disk_bytenr, page_folio(page), + iosize, pg_offset); cur = cur + iosize; pg_offset += iosize; } @@ -1489,8 +1491,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } - submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, - cur - page_offset(page)); + submit_extent_folio(bio_ctrl, disk_bytenr, page_folio(page), + iosize, cur - page_offset(page)); cur += iosize; nr++; } @@ -2087,7 +2089,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. * - * We can get ret > 0 from submit_extent_page() indicating how many ebs + * We can get ret > 0 from submit_extent_folio() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. 
	 */
 	if (ret > 0)

From 56a24a30a45603204bdee2f0fb280ee9eb723c11 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 23 Jul 2024 17:06:03 -0400
Subject: [PATCH 011/110] btrfs: convert btrfs_do_readpage() to only use a folio

Now that the callers and helpers mostly use folios, convert
btrfs_do_readpage() to take a folio and update all of the page-based
logic to use the folio helpers instead.

Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent_io.c | 58 ++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6829f44ea083..6cabeab5d21c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1004,12 +1004,12 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag
  * XXX JDM: This needs looking at to ensure proper page locking
  * return 0 on success, otherwise return error
  */
-static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
 			     struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = folio->mapping->host;
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
-	u64 start = page_offset(page);
+	u64 start = folio_pos(folio);
 	const u64 end = start + PAGE_SIZE - 1;
 	u64 cur = start;
 	u64 extent_offset;
@@ -1022,23 +1022,23 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 	size_t blocksize = fs_info->sectorsize;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 
-	ret = set_page_extent_mapped(page);
+	ret = set_folio_extent_mapped(folio);
 	if (ret < 0) {
 		unlock_extent(tree, start, end, NULL);
-		unlock_page(page);
+		folio_unlock(folio);
 		return ret;
 	}
 
-	if (page->index == last_byte >> PAGE_SHIFT) {
-		size_t zero_offset = offset_in_page(last_byte);
+	if (folio->index == last_byte >> folio_shift(folio)) {
+		size_t zero_offset = offset_in_folio(folio, last_byte);
 
 		if (zero_offset) {
-			iosize = PAGE_SIZE - zero_offset;
-			memzero_page(page, zero_offset, iosize);
+			iosize = folio_size(folio) - zero_offset;
+			folio_zero_range(folio, zero_offset, iosize);
 		}
 	}
 	bio_ctrl->end_io_func = end_bbio_data_read;
-	begin_folio_read(fs_info, page_folio(page));
+	begin_folio_read(fs_info, folio);
 	while (cur <= end) {
 		enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
 		bool force_bio_submit = false;
@@ -1046,16 +1046,17 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 
 		ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
 		if (cur >= last_byte) {
-			iosize = PAGE_SIZE - pg_offset;
-			memzero_page(page, pg_offset, iosize);
+			iosize = folio_size(folio) - pg_offset;
+			folio_zero_range(folio, pg_offset, iosize);
 			unlock_extent(tree, cur, cur + iosize - 1, NULL);
-			end_folio_read(page_folio(page), true, cur, iosize);
+			end_folio_read(folio, true, cur, iosize);
 			break;
 		}
-		em = __get_extent_map(inode, page, cur, end - cur + 1, em_cached);
+		em = __get_extent_map(inode, folio_page(folio, 0), cur,
+				      end - cur + 1, em_cached);
 		if (IS_ERR(em)) {
 			unlock_extent(tree, cur, end, NULL);
-			end_folio_read(page_folio(page), false, cur, end + 1 - cur);
+			end_folio_read(folio, false, cur, end + 1 - cur);
 			return PTR_ERR(em);
 		}
 		extent_offset = cur - em->start;
@@ -1080,8 +1081,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		 * to the same compressed extent 
(possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a - * single bio to populate the pages for the 2 ranges because - * this makes the compressed extent read zero out the pages + * single bio to populate the folios for the 2 ranges because + * this makes the compressed extent read zero out the folios * belonging to the 2nd range. Imagine the following scenario: * * File layout @@ -1094,13 +1095,13 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * [extent X, compressed length = 4K uncompressed length = 16K] * * If the bio to read the compressed extent covers both ranges, - * it will decompress extent X into the pages belonging to the + * it will decompress extent X into the folios belonging to the * first range and then it will stop, zeroing out the remaining - * pages that belong to the other range that points to extent X. + * folios that belong to the other range that points to extent X. * So here we make sure we submit 2 bios, one for the first * range and another one for the third range. Both will target * the same physical extent from disk, but we can't currently - * make the compressed bio endio callback populate the pages + * make the compressed bio endio callback populate the folios * for both ranges because each compressed bio is tightly * coupled with a single extent map, and each range can have * an extent map with a different offset value relative to the @@ -1121,18 +1122,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - memzero_page(page, pg_offset, iosize); + folio_zero_range(folio, pg_offset, iosize); unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_folio_read(page_folio(page), true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; } - /* the get_extent function already copied into the page */ + /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { unlock_extent(tree, cur, cur + iosize - 1, NULL); - end_folio_read(page_folio(page), true, cur, iosize); + end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -1145,8 +1146,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, page_folio(page), - iosize, pg_offset); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + pg_offset); cur = cur + iosize; pg_offset += iosize; } @@ -1165,7 +1166,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(&folio->page, &em_cached, &bio_ctrl, NULL); + ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); free_extent_map(em_cached); /* @@ -2369,8 +2370,7 @@ void btrfs_readahead(struct readahead_control *rac) btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); while ((folio = readahead_folio(rac)) != NULL) - btrfs_do_readpage(&folio->page, &em_cached, &bio_ctrl, - &prev_em_start); + btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); if (em_cached) free_extent_map(em_cached); From 9e97e8b277a2235bbb562a4feb6f1216fb52d1b1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 23 Jul 2024 17:12:15 -0400 Subject: [PATCH 012/110] btrfs: update the 
writepage tracepoint to take a folio Willy is wanting to get rid of page->index, convert the writepage tracepoint to take a folio so we can do folio->index instead of page->index. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- include/trace/events/btrfs.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6cabeab5d21c..51cadf9e61a1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1531,7 +1531,7 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(page, inode, bio_ctrl->wbc); + trace___extent_writepage(folio, inode, bio_ctrl->wbc); WARN_ON(!PageLocked(page)); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0a523023bdcc..0eddbb8b6728 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -674,10 +674,10 @@ TRACE_EVENT(btrfs_finish_ordered_extent, DECLARE_EVENT_CLASS(btrfs__writepage, - TP_PROTO(const struct page *page, const struct inode *inode, + TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), - TP_ARGS(page, inode, wbc), + TP_ARGS(folio, inode, wbc), TP_STRUCT__entry_btrfs( __field( u64, ino ) @@ -695,7 +695,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = btrfs_ino(BTRFS_I(inode)); - __entry->index = page->index; + __entry->index = folio->index; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; @@ -723,10 +723,10 @@ DECLARE_EVENT_CLASS(btrfs__writepage, DEFINE_EVENT(btrfs__writepage, __extent_writepage, - TP_PROTO(const struct page *page, const struct inode *inode, + TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), - TP_ARGS(page, inode, wbc) + TP_ARGS(folio, inode, wbc) ); TRACE_EVENT(btrfs_writepage_end_io_hook, From b8a6263eae0e4290fd699a0ff55eb3f3e121f498 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 14:38:01 -0400 Subject: [PATCH 013/110] btrfs: convert __extent_writepage_io() to take a folio __extent_writepage_io uses page everywhere, but a lot of these functions take a folio. Convert it to use the folio based helpers, and then change it to take a folio as an argument and update its callers. 
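The core of the conversion is replacing the page-offset arithmetic with
its folio counterpart, as in the range assertion from the hunk below
(shown side by side here for illustration):

	/* Page based: hardcodes a PAGE_SIZE sized unit. */
	ASSERT(start >= page_offset(page) &&
	       start + len <= page_offset(page) + PAGE_SIZE);

	/*
	 * Folio based: the same check expressed with folio_pos() and
	 * folio_size(), which also stays correct for large folios.
	 */
	ASSERT(start >= folio_pos(folio) &&
	       start + len <= folio_pos(folio) + folio_size(folio));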
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 55 ++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 51cadf9e61a1..ebd4867f3dd7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1393,10 +1393,10 @@ static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, * < 0 if there were errors (page still locked) */ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - struct page *page, u64 start, u32 len, - struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size, - int *nr_ret) + struct folio *folio, + u64 start, u32 len, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 cur = start; @@ -1407,14 +1407,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int ret = 0; int nr = 0; - ASSERT(start >= page_offset(page) && - start + len <= page_offset(page) + PAGE_SIZE); + ASSERT(start >= folio_pos(folio) && + start + len <= folio_pos(folio) + folio_size(folio)); - ret = btrfs_writepage_cow_fixup(page); + ret = btrfs_writepage_cow_fixup(&folio->page); if (ret) { /* Fixup worker will requeue */ - redirty_page_for_writepage(bio_ctrl->wbc, page); - unlock_page(page); + folio_redirty_for_writepage(bio_ctrl->wbc, folio); + folio_unlock(folio); return 1; } @@ -1428,21 +1428,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, page, cur, len, - true); + btrfs_mark_ordered_io_finished(inode, &folio->page, cur, + len, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. * But we still need to clear the dirty subpage bit, or - * the next time the page gets dirtied, we will try to + * the next time the folio gets dirtied, we will try to * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len); + btrfs_folio_clear_dirty(fs_info, folio, cur, len); break; } - find_next_dirty_byte(fs_info, page, &dirty_range_start, + find_next_dirty_byte(fs_info, &folio->page, &dirty_range_start, &dirty_range_end); if (cur < dirty_range_start) { cur = dirty_range_start; @@ -1478,33 +1478,32 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, em = NULL; /* - * Although the PageDirty bit might be cleared before entering - * this function, subpage dirty bit is not cleared. + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. * So clear subpage dirty bit here so next time we won't submit - * page for range already written to disk. + * folio for range already written to disk. 
*/ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); + btrfs_folio_clear_dirty(fs_info, folio, cur, iosize); btrfs_set_range_writeback(inode, cur, cur + iosize - 1); - if (!PageWriteback(page)) { + if (!folio_test_writeback(folio)) { btrfs_err(inode->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); + "folio %lu not writeback, cur %llu end %llu", + folio->index, cur, end); } - - submit_extent_folio(bio_ctrl, disk_bytenr, page_folio(page), - iosize, cur - page_offset(page)); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, + iosize, cur - folio_pos(folio)); cur += iosize; nr++; } - btrfs_folio_assert_not_dirty(fs_info, page_folio(page), start, len); + btrfs_folio_assert_not_dirty(fs_info, folio, start, len); *nr_ret = nr; return 0; out_error: /* - * If we finish without problem, we should not only clear page dirty, + * If we finish without problem, we should not only clear folio dirty, * but also empty subpage dirty bits */ *nr_ret = nr; @@ -1556,7 +1555,7 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl if (ret) goto done; - ret = __extent_writepage_io(BTRFS_I(inode), page, page_offset(page), + ret = __extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), PAGE_SIZE, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -2308,7 +2307,7 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa if (pages_dirty && page != locked_page) ASSERT(PageDirty(page)); - ret = __extent_writepage_io(BTRFS_I(inode), page, cur, cur_len, + ret = __extent_writepage_io(BTRFS_I(inode), page_folio(page), cur, cur_len, &bio_ctrl, i_size, &nr); if (ret == 1) goto next_page; From c1deaa1438916f263abfa48b389ef0625c2806ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 14:50:51 -0400 Subject: [PATCH 014/110] btrfs: convert extent_write_locked_range() to use folios Instead of using pages for everything, find a folio and use that. This makes things a bit cleaner as a lot of the functions calls here all take folios. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ebd4867f3dd7..06f75d80e37a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2299,37 +2299,47 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); u32 cur_len = cur_end + 1 - cur; - struct page *page; + struct folio *folio; int nr = 0; - page = find_get_page(mapping, cur >> PAGE_SHIFT); - ASSERT(PageLocked(page)); - if (pages_dirty && page != locked_page) - ASSERT(PageDirty(page)); + folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); - ret = __extent_writepage_io(BTRFS_I(inode), page_folio(page), cur, cur_len, + /* + * This shouldn't happen, the pages are pinned and locked, this + * code is just in case, but shouldn't actually be run. 
+ */ + if (IS_ERR(folio)) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + cur, cur_len, false); + mapping_set_error(mapping, PTR_ERR(folio)); + cur = cur_end + 1; + continue; + } + + ASSERT(folio_test_locked(folio)); + if (pages_dirty && &folio->page != locked_page) + ASSERT(folio_test_dirty(folio)); + + ret = __extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, &bio_ctrl, i_size, &nr); if (ret == 1) goto next_page; /* Make sure the mapping tag for page dirty gets cleared. */ if (nr == 0) { - struct folio *folio; - - folio = page_folio(page); btrfs_folio_set_writeback(fs_info, folio, cur, cur_len); btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len); } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + btrfs_mark_ordered_io_finished(BTRFS_I(inode), &folio->page, cur, cur_len, !ret); - mapping_set_error(page->mapping, ret); + mapping_set_error(mapping, ret); } - btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len); + btrfs_folio_unlock_writer(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: - put_page(page); + folio_put(folio); cur = cur_end + 1; } From 9b320229c03bbc45e60c59041e79b3cedcea2fdf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 14:58:22 -0400 Subject: [PATCH 015/110] btrfs: convert __extent_writepage() to be completely folio based Now that we've gotten most of the helpers updated to only take a folio, update __extent_writepage to only deal in folios. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 06f75d80e37a..85a28becd3f9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1519,11 +1519,10 @@ out_error: * Return 0 if everything goes well. * Return <0 for error. 
*/ -static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; - const u64 page_start = page_offset(page); + struct inode *inode = folio->mapping->host; + const u64 page_start = folio_pos(folio); int ret; int nr = 0; size_t pg_offset; @@ -1532,24 +1531,24 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl trace___extent_writepage(folio, inode, bio_ctrl->wbc); - WARN_ON(!PageLocked(page)); + WARN_ON(!folio_test_locked(folio)); - pg_offset = offset_in_page(i_size); - if (page->index > end_index || - (page->index == end_index && !pg_offset)) { + pg_offset = offset_in_folio(folio, i_size); + if (folio->index > end_index || + (folio->index == end_index && !pg_offset)) { folio_invalidate(folio, 0, folio_size(folio)); folio_unlock(folio); return 0; } - if (page->index == end_index) - memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); + if (folio->index == end_index) + folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + ret = writepage_delalloc(BTRFS_I(inode), &folio->page, bio_ctrl->wbc); if (ret == 1) return 0; if (ret) @@ -1565,13 +1564,13 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ - set_page_writeback(page); - end_page_writeback(page); + folio_start_writeback(folio); + folio_end_writeback(folio); } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, - PAGE_SIZE, !ret); - mapping_set_error(page->mapping, ret); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), &folio->page, + page_start, PAGE_SIZE, !ret); + mapping_set_error(folio->mapping, ret); } btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio); @@ -2228,7 +2227,7 @@ retry: continue; } - ret = __extent_writepage(&folio->page, bio_ctrl); + ret = __extent_writepage(folio, bio_ctrl); if (ret < 0) { done = 1; break; From c808c1dcb1b2fe7caf4729e895881d5a87b31621 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:16:40 -0400 Subject: [PATCH 016/110] btrfs: convert add_ra_bio_pages() to use only folios Willy is going to get rid of page->index, and add_ra_bio_pages uses page->index. Make his life easier by converting add_ra_bio_pages to use folios so that we are no longer using page->index. 
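The allocate-and-insert step becomes the standard folio pattern,
condensed here from the hunk below for illustration:

	folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
							   ~__GFP_FS), 0);
	if (!folio)
		break;

	if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) {
		/* There is already a folio at this index, skip to its end. */
		cur += folio_size(folio);
		folio_put(folio);
		continue;
	}

Compared to the old __page_cache_alloc()/add_to_page_cache_lru() pair,
the folio API advances past an already cached range via folio_size()
instead of assuming PAGE_SIZE.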
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 62 ++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index a8e2c461aff7..832ab8984c41 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -420,7 +420,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; - struct page *page; + struct folio *folio; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; @@ -453,9 +453,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - page = xa_load(&mapping->i_pages, pg_index); - if (page && !xa_is_value(page)) { - sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + folio = __filemap_get_folio(mapping, pg_index, 0, 0); + if (!IS_ERR(folio)) { + u64 folio_sz = folio_size(folio); + u64 offset = offset_in_folio(folio, cur); + + folio_put(folio); + sectors_missed += (folio_sz - offset) >> fs_info->sectorsize_bits; /* Beyond threshold, no need to continue */ @@ -466,35 +470,35 @@ static noinline int add_ra_bio_pages(struct inode *inode, * Jump to next page start as we already have page for * current offset. */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += (folio_sz - offset); continue; } - page = __page_cache_alloc(mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, + ~__GFP_FS), 0); + if (!folio) break; - if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { - put_page(page); + if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { /* There is already a page, skip to page end */ - cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + cur += folio_size(folio); + folio_put(folio); continue; } - if (!*memstall && PageWorkingset(page)) { + if (!*memstall && folio_test_workingset(folio)) { psi_memstall_enter(pflags); *memstall = 1; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } - page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); @@ -511,28 +515,28 @@ static noinline int add_ra_bio_pages(struct inode *inode, orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; free_extent_map(em); - if (page->index == end_index) { - size_t zero_offset = offset_in_page(isize); + if (folio->index == end_index) { + size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { int zeros; - zeros = PAGE_SIZE - zero_offset; - memzero_page(page, zero_offset, zeros); + zeros = folio_size(folio) - zero_offset; + folio_zero_range(folio, zero_offset, zeros); } } - ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); - if (ret != add_size) { + if (!bio_add_folio(orig_bio, folio, add_size, + offset_in_folio(folio, cur))) { unlock_extent(tree, cur, page_end, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + 
folio_put(folio); break; } /* @@ -541,9 +545,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, * subpage::readers and to unlock the page. */ if (fs_info->sectorsize < PAGE_SIZE) - btrfs_subpage_start_reader(fs_info, page_folio(page), - cur, add_size); - put_page(page); + btrfs_subpage_start_reader(fs_info, folio, cur, + add_size); + folio_put(folio); cur += add_size; } return 0; From 7e755aa731f704e733768c4e58650910bebc9ce3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:24:35 -0400 Subject: [PATCH 017/110] btrfs: utilize folio more in btrfs_page_mkwrite() We already have a folio that we're using in btrfs_page_mkwrite, update the rest of the function to use folio everywhere else. This will make it easier on Willy when he drops page->index. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2aeb8116549c..c7a7234998aa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1920,8 +1920,8 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + page_start = folio_pos(folio); + page_end = page_start + folio_size(folio) - 1; end = page_end; /* @@ -1949,18 +1949,18 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; again: down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); + folio_lock(folio); size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || + if ((folio->mapping != inode->i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); + ret2 = set_folio_extent_mapped(folio); if (ret2 < 0) { ret = vmf_error(ret2); unlock_extent(io_tree, page_start, page_end, &cached_state); @@ -1974,14 +1974,14 @@ again: ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } - if (page->index == ((size - 1) >> PAGE_SHIFT)) { + if (folio->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; @@ -2011,13 +2011,13 @@ again: } /* Page is wholly or partially inside EOF. 
*/ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); + if (page_start + folio_size(folio) > size) + zero_start = offset_in_folio(folio, size); else zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); + folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); @@ -2034,7 +2034,7 @@ again: return VM_FAULT_LOCKED; out_unlock: - unlock_page(page); + folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); From 0a577636a9399d40c1da68fc61ebfbe21f793739 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:49:47 -0400 Subject: [PATCH 018/110] btrfs: convert can_finish_ordered_extent() to use a folio Pass in a folio instead, and use a folio instead of a page. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 82a68394a89c..760a37512c7e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -332,7 +332,7 @@ static void finish_ordered_fn(struct btrfs_work *work) } static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; @@ -340,10 +340,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, lockdep_assert_held(&inode->ordered_tree_lock); - if (page) { - ASSERT(page->mapping); - ASSERT(page_offset(page) <= file_offset); - ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + if (folio) { + ASSERT(folio->mapping); + ASSERT(folio_pos(folio) <= file_offset); + ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio)); /* * Ordered (Private2) bit indicates whether we still have @@ -351,10 +351,9 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_folio_test_ordered(fs_info, page_folio(page), - file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) return false; - btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len); + btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); } /* Now we're fine to update the accounting. 
 */
@@ -408,7 +407,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
 	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
 
 	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
-	ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
+	ret = can_finish_ordered_extent(ordered, page_folio(page), file_offset,
+					len, uptodate);
 	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
 
 	/*
@@ -524,7 +524,8 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 		ASSERT(end + 1 - cur < U32_MAX);
 		len = end + 1 - cur;
 
-		if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
+		if (can_finish_ordered_extent(entry, page_folio(page), cur, len,
+					      uptodate)) {
 			spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
 			btrfs_queue_ordered_fn(entry);
 			spin_lock_irqsave(&inode->ordered_tree_lock, flags);

From aef665d69ad15afaebdc2c32b3e58fc526ba6c3d Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Wed, 24 Jul 2024 15:53:18 -0400
Subject: [PATCH 019/110] btrfs: convert btrfs_finish_ordered_extent() to take a folio

The callers and callees of this now all use folios, update it to take a
folio as well.

Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent_io.c    | 4 ++--
 fs/btrfs/ordered-data.c | 6 +++---
 fs/btrfs/ordered-data.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85a28becd3f9..cf2022ea2d82 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -472,8 +472,8 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
 			   "incomplete page write with offset %zu and length %zu",
 			   fi.offset, fi.length);
 
-		btrfs_finish_ordered_extent(bbio->ordered,
-				folio_page(folio, 0), start, len, !error);
+		btrfs_finish_ordered_extent(bbio->ordered, folio, start, len,
+					    !error);
 		if (error)
 			mapping_set_error(folio->mapping, error);
 		btrfs_folio_clear_writeback(fs_info, folio, start, len);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 760a37512c7e..e97747956040 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -397,7 +397,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
 }
 
 void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
-				 struct page *page, u64 file_offset, u64 len,
+				 struct folio *folio, u64 file_offset, u64 len,
 				 bool uptodate)
 {
 	struct btrfs_inode *inode = ordered->inode;
@@ -407,8 +407,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
 	trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
 
 	spin_lock_irqsave(&inode->ordered_tree_lock, flags);
-	ret = can_finish_ordered_extent(ordered, page_folio(page), file_offset,
-					len, uptodate);
+	ret = can_finish_ordered_extent(ordered, folio, file_offset, len,
+					uptodate);
 	spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
 
 	/*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 51b9e81726e2..90c1c3c51ae5 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -163,7 +163,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
 void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
 				 struct btrfs_ordered_extent *entry);
 void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
-				 struct page *page, u64 file_offset, u64 len,
+				 struct folio *folio, u64 file_offset, u64 len,
 				 bool uptodate);
 void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
 				    struct page *page, u64
file_offset, From a79228011c75f9123ba2dbfd010cba27ea87b973 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 15:57:10 -0400 Subject: [PATCH 020/110] btrfs: convert btrfs_mark_ordered_io_finished() to take a folio We only need a folio now, make it take a folio as an argument and update all of the callers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- fs/btrfs/inode.c | 7 ++++--- fs/btrfs/ordered-data.c | 9 ++++----- fs/btrfs/ordered-data.h | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cf2022ea2d82..007b690b914c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1428,8 +1428,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, &folio->page, cur, - len, true); + btrfs_mark_ordered_io_finished(inode, folio, cur, len, + true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1568,7 +1568,7 @@ done: folio_end_writeback(folio); } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), &folio->page, + btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, page_start, PAGE_SIZE, !ret); mapping_set_error(folio->mapping, ret); } @@ -2330,7 +2330,7 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len); } if (ret) { - btrfs_mark_ordered_io_finished(BTRFS_I(inode), &folio->page, + btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, cur, cur_len, !ret); mapping_set_error(mapping, ret); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1b6564ab68f..26a3d429d9a5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1144,7 +1144,8 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, set_page_writeback(locked_page); end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, locked_page, + btrfs_mark_ordered_io_finished(inode, + page_folio(locked_page), page_start, PAGE_SIZE, !ret); mapping_set_error(locked_page->mapping, ret); @@ -2802,8 +2803,8 @@ out_page: * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page, page_start, - PAGE_SIZE, !ret); + btrfs_mark_ordered_io_finished(inode, page_folio(page), + page_start, PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e97747956040..eb9b32ffbc0c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -449,8 +449,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, /* * Mark all ordered extents io inside the specified range finished. * - * @page: The involved page for the operation. - * For uncompressed buffered IO, the page status also needs to be + * @folio: The involved folio for the operation. + * For uncompressed buffered IO, the folio status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the @@ -460,7 +460,7 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * extent(s) covering it. 
*/ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, + struct folio *folio, u64 file_offset, u64 num_bytes, bool uptodate) { struct rb_node *node; @@ -524,8 +524,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (can_finish_ordered_extent(entry, page_folio(page), cur, len, - uptodate)) { + if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&inode->ordered_tree_lock, flags); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 90c1c3c51ae5..4e152736d06c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -166,8 +166,8 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, struct folio *folio, u64 file_offset, u64 len, bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, bool uptodate); + struct folio *folio, u64 file_offset, + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); From dc6c745447c57d115e48f23dbcad00c20c937cca Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:03:04 -0400 Subject: [PATCH 021/110] btrfs: convert writepage_delalloc() to take a folio We already use a folio heavily in this function, pass the folio in directly and use it everywhere, only passing the page down to functions that do not take a folio yet. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 007b690b914c..1c784f7e6858 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1188,13 +1188,13 @@ int btrfs_read_folio(struct file *file, struct folio *folio) * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, - struct page *page, struct writeback_control *wbc) + struct folio *folio, + struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); - struct folio *folio = page_folio(page); - const bool is_subpage = btrfs_is_subpage(fs_info, page->mapping); - const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; + const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const u64 page_start = folio_pos(folio); + const u64 page_end = page_start + folio_size(folio) - 1; /* * Save the last found delalloc end. As the delalloc end can go beyond * page boundary, thus we cannot rely on subpage bitmap to locate the @@ -1206,10 +1206,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_to_write = 0; int ret = 0; - /* Lock all (subpage) delalloc ranges inside the page first. */ + /* Lock all (subpage) delalloc ranges inside the folio first. 
*/ while (delalloc_start < page_end) { delalloc_end = page_end; - if (!find_lock_delalloc_range(&inode->vfs_inode, page, + if (!find_lock_delalloc_range(&inode->vfs_inode, &folio->page, &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; @@ -1234,7 +1234,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (!is_subpage) { /* * For non-subpage case, the found delalloc range must - * cover this page and there must be only one locked + * cover this folio and there must be only one locked * delalloc range. */ found_start = page_start; @@ -1248,7 +1248,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, break; /* * The subpage range covers the last sector, the delalloc range may - * end beyond the page boundary, use the saved delalloc_end + * end beyond the folio boundary, use the saved delalloc_end * instead. */ if (found_start + found_len >= page_end) @@ -1256,7 +1256,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (ret >= 0) { /* No errors hit so far, run the current delalloc range. */ - ret = btrfs_run_delalloc_range(inode, page, found_start, + ret = btrfs_run_delalloc_range(inode, &folio->page, + found_start, found_start + found_len - 1, wbc); } else { @@ -1266,15 +1267,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, page, found_start, + __unlock_for_delalloc(&inode->vfs_inode, &folio->page, + found_start, found_start + found_len - 1); } /* * We can hit btrfs_run_delalloc_range() with >0 return value. * - * This happens when either the IO is already done and page - * unlocked (inline) or the IO submission and page unlock would + * This happens when either the IO is already done and folio + * unlocked (inline) or the IO submission and folio unlock would * be handled as async (compression). * * Inline is only possible for regular sectorsize for now. @@ -1282,14 +1284,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * Compression is possible for both subpage and regular cases, * but even for subpage compression only happens for page aligned * range, thus the found delalloc range must go beyond current - * page. + * folio. */ if (ret > 0) ASSERT(!is_subpage || found_start + found_len >= page_end); /* - * Above btrfs_run_delalloc_range() may have unlocked the page, - * thus for the last range, we cannot touch the page anymore. + * Above btrfs_run_delalloc_range() may have unlocked the folio, + * thus for the last range, we cannot touch the folio anymore. */ if (found_start + found_len >= last_delalloc_end + 1) break; @@ -1312,7 +1314,7 @@ out: /* * If btrfs_run_dealloc_range() already started I/O and unlocked - * the pages, we just need to account for them here. + * the folios, we just need to account for them here. 
*/ if (ret == 1) { wbc->nr_to_write -= delalloc_to_write; @@ -1548,7 +1550,7 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), &folio->page, bio_ctrl->wbc); + ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl->wbc); if (ret == 1) return 0; if (ret) From c987f1e6d4435a6f1f62d82705ef8177823ae703 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:08:13 -0400 Subject: [PATCH 022/110] btrfs: convert find_lock_delalloc_range() to use a folio Instead of passing in a page for locked_page, pass in the folio instead. We only use the folio itself to validate some range assumptions, and then pass it into other functions. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 28 ++++++++++++++-------------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/tests/extent-io-tests.c | 10 +++++----- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1c784f7e6858..b9921b8235e2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -304,8 +304,8 @@ out: */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, - u64 *end) + struct folio *locked_folio, + u64 *start, u64 *end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -323,9 +323,9 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, /* Caller should pass a valid @end to indicate the search range end */ ASSERT(orig_end > orig_start); - /* The range should at least cover part of the page */ - ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || - orig_end <= page_offset(locked_page))); + /* The range should at least cover part of the folio */ + ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) || + orig_end <= folio_pos(locked_folio))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; @@ -342,25 +342,25 @@ again: } /* - * start comes from the offset of locked_page. We have to lock - * pages in order, so we can't process delalloc bytes before - * locked_page + * start comes from the offset of locked_folio. 
We have to lock + * folios in order, so we can't process delalloc bytes before + * locked_folio */ if (delalloc_start < *start) delalloc_start = *start; /* - * make sure to limit the number of pages we try to lock down + * make sure to limit the number of folios we try to lock down */ if (delalloc_end + 1 - delalloc_start > max_bytes) delalloc_end = delalloc_start + max_bytes - 1; - /* step two, lock all the pages after the page that has start */ - ret = lock_delalloc_pages(inode, locked_page, + /* step two, lock all the folioss after the folios that has start */ + ret = lock_delalloc_pages(inode, &locked_folio->page, delalloc_start, delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { - /* some of the pages are gone, lets avoid looping by + /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ free_extent_state(cached_state); @@ -384,7 +384,7 @@ again: unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, locked_page, + __unlock_for_delalloc(inode, &locked_folio->page, delalloc_start, delalloc_end); cond_resched(); goto again; @@ -1209,7 +1209,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; - if (!find_lock_delalloc_range(&inode->vfs_inode, &folio->page, + if (!find_lock_delalloc_range(&inode->vfs_inode, folio, &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index dceebd76c7d1..1dd295e1b5a5 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -368,7 +368,7 @@ int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, - struct page *locked_page, u64 *start, + struct folio *locked_folio, u64 *start, u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 865d4af4b303..0a2dbfaaf49e 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -180,7 +180,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -211,7 +211,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) } start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -266,7 +266,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) 
set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); @@ -307,7 +307,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, locked_page, &start, + found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, &end); if (!found) { test_err("didn't find our range"); From e4d80ebe50de25f1a3c32b7a307bcfc52c9d21ae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:15:13 -0400 Subject: [PATCH 023/110] btrfs: convert lock_delalloc_pages() to take a folio Also rename lock_delalloc_pages => lock_delalloc_folios in the process, now that it exclusively works on folios. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b9921b8235e2..845c1c3efe8e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -230,10 +230,9 @@ static noinline void __unlock_for_delalloc(const struct inode *inode, PAGE_UNLOCK); } -static noinline int lock_delalloc_pages(struct inode *inode, - const struct page *locked_page, - u64 start, - u64 end) +static noinline int lock_delalloc_folios(struct inode *inode, + const struct folio *locked_folio, + u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct address_space *mapping = inode->i_mapping; @@ -243,7 +242,7 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_page->index && index == end_index) + if (index == locked_folio->index && index == end_index) return 0; folio_batch_init(&fbatch); @@ -257,23 +256,22 @@ static noinline int lock_delalloc_pages(struct inode *inode, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - struct page *page = folio_page(folio, 0); u32 len = end + 1 - start; - if (page == locked_page) + if (folio == locked_folio) continue; if (btrfs_folio_start_writer_lock(fs_info, folio, start, len)) goto out; - if (!PageDirty(page) || page->mapping != mapping) { + if (!folio_test_dirty(folio) || folio->mapping != mapping) { btrfs_folio_end_writer_lock(fs_info, folio, start, len); goto out; } - processed_end = page_offset(page) + PAGE_SIZE - 1; + processed_end = folio_pos(folio) + folio_size(folio) - 1; } folio_batch_release(&fbatch); cond_resched(); @@ -283,7 +281,8 @@ static noinline int lock_delalloc_pages(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, locked_page, start, processed_end); + __unlock_for_delalloc(inode, &locked_folio->page, start, + processed_end); return -EAGAIN; } @@ -356,8 +355,8 @@ again: delalloc_end = delalloc_start + max_bytes - 1; /* step two, lock all the folioss after the folios that has start */ - ret = lock_delalloc_pages(inode, &locked_folio->page, - delalloc_start, delalloc_end); + ret = lock_delalloc_folios(inode, locked_folio, delalloc_start, + delalloc_end); ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the folios are gone, lets avoid looping by From 
79be4a28d834eacab2e827985f8fa7951a9122b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:17:43 -0400 Subject: [PATCH 024/110] btrfs: convert __unlock_for_delalloc() to take a folio All of the callers have a folio at this point, update __unlock_for_delalloc to take a folio so that it's consistent with its callers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 845c1c3efe8e..79b621277577 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -216,18 +216,18 @@ static void __process_pages_contig(struct address_space *mapping, } static noinline void __unlock_for_delalloc(const struct inode *inode, - const struct page *locked_page, + const struct folio *locked_folio, u64 start, u64 end) { unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_page); - if (index == locked_page->index && end_index == index) + ASSERT(locked_folio); + if (index == locked_folio->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK); + __process_pages_contig(inode->i_mapping, &locked_folio->page, start, + end, PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, @@ -281,7 +281,7 @@ static noinline int lock_delalloc_folios(struct inode *inode, out: folio_batch_release(&fbatch); if (processed_end > start) - __unlock_for_delalloc(inode, &locked_folio->page, start, + __unlock_for_delalloc(inode, locked_folio, start, processed_end); return -EAGAIN; } @@ -383,8 +383,8 @@ again: unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - __unlock_for_delalloc(inode, &locked_folio->page, - delalloc_start, delalloc_end); + __unlock_for_delalloc(inode, locked_folio, delalloc_start, + delalloc_end); cond_resched(); goto again; } @@ -1266,7 +1266,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, */ unlock_extent(&inode->io_tree, found_start, found_start + found_len - 1, NULL); - __unlock_for_delalloc(&inode->vfs_inode, &folio->page, + __unlock_for_delalloc(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); } From a59ff7201a1586a76035140c8871e0e8c3a89449 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:20:02 -0400 Subject: [PATCH 025/110] btrfs: convert __process_pages_contig() to take a folio This operates mostly on folios, update it to take a folio for the locked folio instead of the page, rename from __process_pages_contig => __process_folios_contig. 
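For context, a sketch of the batched walk this function is built around
(illustrative only, not code from the patch; process_folio() is a made-up
stand-in for the per-folio work, while filemap_get_folios_contig() and the
folio_batch helpers are the interfaces from linux/pagemap.h and
linux/pagevec.h):

	#include <linux/pagemap.h>
	#include <linux/pagevec.h>
	#include <linux/sched.h>

	static void walk_contig_range(struct address_space *mapping,
				      u64 start, u64 end)
	{
		pgoff_t index = start >> PAGE_SHIFT;
		pgoff_t end_index = end >> PAGE_SHIFT;
		struct folio_batch fbatch;

		folio_batch_init(&fbatch);
		while (index <= end_index) {
			unsigned int found, i;

			/* Grab a batch of folios; advances @index past them. */
			found = filemap_get_folios_contig(mapping, &index,
							  end_index, &fbatch);
			if (found == 0)
				break;
			for (i = 0; i < found; i++)
				process_folio(fbatch.folios[i]); /* hypothetical */
			/* Drop the references the lookup took. */
			folio_batch_release(&fbatch);
			cond_resched();
		}
	}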
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 79b621277577..c91ec5661ee3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -187,9 +187,9 @@ static void process_one_page(struct btrfs_fs_info *fs_info, btrfs_folio_end_writer_lock(fs_info, folio, start, len); } -static void __process_pages_contig(struct address_space *mapping, - const struct page *locked_page, u64 start, u64 end, - unsigned long page_ops) +static void __process_folios_contig(struct address_space *mapping, + const struct folio *locked_folio, u64 start, + u64 end, unsigned long page_ops) { struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host); pgoff_t start_index = start >> PAGE_SHIFT; @@ -207,8 +207,9 @@ static void __process_pages_contig(struct address_space *mapping, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - process_one_page(fs_info, &folio->page, locked_page, - page_ops, start, end); + process_one_page(fs_info, &folio->page, + &locked_folio->page, page_ops, start, + end); } folio_batch_release(&fbatch); cond_resched(); @@ -226,8 +227,8 @@ static noinline void __unlock_for_delalloc(const struct inode *inode, if (index == locked_folio->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, &locked_folio->page, start, - end, PAGE_UNLOCK); + __process_folios_contig(inode->i_mapping, locked_folio, start, end, + PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, @@ -401,8 +402,8 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, { clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); - __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops); + __process_folios_contig(inode->vfs_inode.i_mapping, + page_folio(locked_page), start, end, page_ops); } static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) From c9ce51d67f385f513502a95bdc647c62345cb612 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:22:15 -0400 Subject: [PATCH 026/110] btrfs: convert process_one_page() to operate only on folios Now that this mostly uses folios, update it to take folios, use the folios that are passed in, and rename from process_one_page => process_one_folio. 
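The btrfs_folio_clamp_*() helpers used here take care of clipping the byte
range to the folio. Roughly, and ignoring the subpage bitmap bookkeeping
they also do, the clamping amounts to the following sketch
(clamp_range_to_folio() is a made-up name; min()/max() come from
linux/minmax.h):

	static void clamp_range_to_folio(const struct folio *folio,
					 u64 start, u64 end,
					 u64 *clamped_start, u32 *clamped_len)
	{
		u64 folio_start = folio_pos(folio);
		u64 folio_end = folio_start + folio_size(folio) - 1;

		/* Intersect [start, end] with the folio's byte range. */
		*clamped_start = max(start, folio_start);
		/* Fits in u32: callers assert end + 1 - start < U32_MAX. */
		*clamped_len = min(end, folio_end) + 1 - *clamped_start;
	}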
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c91ec5661ee3..d73496ce7f52 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -164,11 +164,10 @@ void __cold extent_buffer_free_cachep(void) kmem_cache_destroy(extent_buffer_cache); } -static void process_one_page(struct btrfs_fs_info *fs_info, - struct page *page, const struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_folio(struct btrfs_fs_info *fs_info, + struct folio *folio, const struct folio *locked_folio, + unsigned long page_ops, u64 start, u64 end) { - struct folio *folio = page_folio(page); u32 len; ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); @@ -183,7 +182,7 @@ static void process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); - if (page != locked_page && (page_ops & PAGE_UNLOCK)) + if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) btrfs_folio_end_writer_lock(fs_info, folio, start, len); } @@ -207,9 +206,8 @@ static void __process_folios_contig(struct address_space *mapping, for (i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; - process_one_page(fs_info, &folio->page, - &locked_folio->page, page_ops, start, - end); + process_one_folio(fs_info, folio, locked_folio, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); From a67f5405827e5259dfe8a094f1f13e7b48d675df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:29:18 -0400 Subject: [PATCH 027/110] btrfs: convert extent_clear_unlock_delalloc() to take a folio Instead of taking the locked page, take the locked folio so we can pass that into __process_folios_contig. 
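The callers are converted with the page_folio() bridge for now, since some
of them still hold a struct page. As a reminder of the idiom (a sketch,
not code from the patch):

	/* page_folio() cannot fail and takes no extra reference. */
	struct folio *folio = page_folio(page);

	/*
	 * A large folio may cover more than PAGE_SIZE, so derive
	 * positions and lengths from the folio itself.
	 */
	u64 start = folio_pos(folio);
	u64 len = folio_size(folio);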
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 25 ++++++++++++++----------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d73496ce7f52..a1ba45690635 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -394,14 +394,14 @@ out_failed: } void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - const struct page *locked_page, + const struct folio *locked_folio, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); - __process_folios_contig(inode->vfs_inode.i_mapping, - page_folio(locked_page), start, end, page_ops); + __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, + end, page_ops); } static bool btrfs_verify_folio(struct folio *folio, u64 start, u32 len) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1dd295e1b5a5..5d36031578ff 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -354,7 +354,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - const struct page *locked_page, + const struct folio *locked_folio, struct extent_state **cached, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 26a3d429d9a5..3dff49ba4453 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -743,10 +743,10 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, if (ret == 0) locked_page = NULL; - extent_clear_unlock_delalloc(inode, offset, end, locked_page, &cached, - clear_flags, - PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, offset, end, + page_folio(locked_page), &cached, + clear_flags, PAGE_UNLOCK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return ret; } @@ -1501,7 +1501,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, &cached, + page_folio(locked_page), &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1560,7 +1560,8 @@ out_unlock: if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, NULL, 0, page_ops); + page_folio(locked_page), NULL, 0, + page_ops); } /* @@ -1583,7 +1584,7 @@ out_unlock: if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_page, &cached, + page_folio(locked_page), &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); @@ -1598,8 +1599,9 @@ out_unlock: */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, locked_page, - &cached, clear_bits, page_ops); + extent_clear_unlock_delalloc(inode, start, end, + page_folio(locked_page), &cached, + clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } return ret; @@ -2207,7 +2209,8 @@ must_cow: btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, 
nocow_end, - locked_page, &cached_state, + page_folio(locked_page), + &cached_state, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2256,7 +2259,7 @@ error: lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, &cached, + page_folio(locked_page), &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | From 01e11841f0cf6c102821a7c8d9f7f49e7b2d0b5b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:31:44 -0400 Subject: [PATCH 028/110] btrfs: convert extent_write_locked_range() to take a folio This mostly uses folios, convert it to take a folio instead and update the callers to pass in the folio. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1ba45690635..9ae17c9fd89b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2274,7 +2274,7 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -void extent_write_locked_range(struct inode *inode, const struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -2316,7 +2316,7 @@ void extent_write_locked_range(struct inode *inode, const struct page *locked_pa } ASSERT(folio_test_locked(folio)); - if (pages_dirty && &folio->page != locked_page) + if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); ret = __extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5d36031578ff..b38460279b99 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -240,7 +240,7 @@ bool try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -void extent_write_locked_range(struct inode *inode, const struct page *locked_page, +void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3dff49ba4453..5761ccc92a44 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1758,7 +1758,8 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, locked_page, start, + extent_write_locked_range(&inode->vfs_inode, + page_folio(locked_page), start, done_offset, wbc, pages_dirty); start = done_offset + 1; } From 2cdc1fbb1b1558b5ca1fe81d4fee723f09940bba Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:34:15 -0400 Subject: [PATCH 029/110] btrfs: convert run_delalloc_cow() to take a folio We pass the folio into extent_write_locked_range, go ahead and take a folio to pass along, and update the callers to pass in a folio. 
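For context, the loop being converted has this shape (a simplified sketch,
not the patch's code; cow_one_chunk() and write_one_range() are invented
stand-ins for cow_file_range() and extent_write_locked_range()):

	while (start <= end) {
		u64 done_offset;
		int ret;

		/* COW some prefix of [start, end], reporting how far it got. */
		ret = cow_one_chunk(inode, locked_folio, start, end,
				    &done_offset);
		if (ret)
			return ret;
		/* Write back only the range that was just allocated. */
		write_one_range(inode, locked_folio, start, done_offset);
		/* Resume right after the completed chunk. */
		start = done_offset + 1;
	}
	return 1;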
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5761ccc92a44..bcbaf2e65e5a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -116,7 +116,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); @@ -1135,7 +1135,8 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + ret = run_delalloc_cow(inode, page_folio(locked_page), start, end, + &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); @@ -1746,7 +1747,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { @@ -1754,13 +1755,12 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, &done_offset, - true, false); + ret = cow_file_range(inode, &locked_folio->page, start, end, + &done_offset, true, false); if (ret) return ret; - extent_write_locked_range(&inode->vfs_inode, - page_folio(locked_page), start, - done_offset, wbc, pages_dirty); + extent_write_locked_range(&inode->vfs_inode, locked_folio, + start, done_offset, wbc, pages_dirty); start = done_offset + 1; } @@ -2311,8 +2311,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page return 1; if (zoned) - ret = run_delalloc_cow(inode, locked_page, start, end, wbc, - true); + ret = run_delalloc_cow(inode, page_folio(locked_page), start, + end, wbc, true); else ret = cow_file_range(inode, locked_page, start, end, NULL, false, false); From 9f5db28074ade4edf8cf081927134f7eebd912f3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 26 Jul 2024 15:25:45 -0400 Subject: [PATCH 030/110] btrfs: convert cow_file_range_inline() to take a folio Now that we want the folio in this function, convert it to take a folio directly and use that. 
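One detail worth spelling out from the hunk below: on success the function
clears its local locked_folio pointer before calling
extent_clear_unlock_delalloc(). That helper normally skips the locked
folio so the caller keeps ownership of it; passing NULL makes PAGE_UNLOCK
apply to every folio in the range, including the one that was locked. In
sketch form:

	if (ret == 0)			/* inline extent fully covers the range */
		locked_folio = NULL;	/* nothing is skipped below... */

	/* ...so PAGE_UNLOCK now covers the previously locked folio too. */
	extent_clear_unlock_delalloc(inode, offset, end, locked_folio,
				     &cached, clear_flags, PAGE_UNLOCK |
				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);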
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bcbaf2e65e5a..fd3232472d32 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -715,7 +715,7 @@ out: } static noinline int cow_file_range_inline(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, u64 offset, u64 end, size_t compressed_size, int compress_type, @@ -741,10 +741,9 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, } if (ret == 0) - locked_page = NULL; + locked_folio = NULL; - extent_clear_unlock_delalloc(inode, offset, end, - page_folio(locked_page), &cached, + extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return ret; @@ -1365,8 +1364,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, locked_page, start, end, 0, - BTRFS_COMPRESS_NONE, NULL, false); + ret = cow_file_range_inline(inode, page_folio(locked_page), + start, end, 0, BTRFS_COMPRESS_NONE, + NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done From 4cf7e0562f5f1819d658f8bc6349a787bafc9da6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:37:29 -0400 Subject: [PATCH 031/110] btrfs: convert cow_file_range() to take a folio Convert this to take a folio and pass it into all of the various cleanup functions. Update the callers to pass in a folio instead. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fd3232472d32..6047fd35fe43 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1307,21 +1307,21 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * - * locked_page is the page that writepage had locked already. We use + * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * When this function fails, it unlocks all pages except @locked_page. + * When this function fails, it unlocks all pages except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and - * unlocks all pages including locked_page and starts I/O on them. - * (In reality inline extents are limited to a single page, so locked_page is + * unlocks all pages including locked_folio and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_folio is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. - * - Else all pages except for @locked_page are unlocked. + * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1330,8 +1330,8 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. 
*/ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, u64 start, u64 end, - u64 *done_offset, + struct folio *locked_folio, u64 start, + u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; @@ -1364,9 +1364,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, page_folio(locked_page), - start, end, 0, BTRFS_COMPRESS_NONE, - NULL, false); + ret = cow_file_range_inline(inode, locked_folio, start, end, 0, + BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done @@ -1502,7 +1501,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - page_folio(locked_page), &cached, + locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1555,14 +1554,13 @@ out_unlock: * function. * * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_page) to ensure all the pages are unlocked. + * (except @locked_folio) to ensure all the pages are unlocked. */ if (keep_locked && orig_start < start) { - if (!locked_page) + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - page_folio(locked_page), NULL, 0, - page_ops); + locked_folio, NULL, 0, page_ops); } /* @@ -1585,8 +1583,7 @@ out_unlock: if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - page_folio(locked_page), &cached, - clear_bits, + locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); start += cur_alloc_size; @@ -1600,9 +1597,8 @@ out_unlock: */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; - extent_clear_unlock_delalloc(inode, start, end, - page_folio(locked_page), &cached, - clear_bits, page_ops); + extent_clear_unlock_delalloc(inode, start, end, locked_folio, + &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } return ret; @@ -1755,7 +1751,7 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, int ret; while (start <= end) { - ret = cow_file_range(inode, &locked_folio->page, start, end, + ret = cow_file_range(inode, locked_folio, start, end, &done_offset, true, false); if (ret) return ret; @@ -1837,7 +1833,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * is written out and unlocked directly and a normal NOCOW extent * doesn't work. 
*/ - ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ret = cow_file_range(inode, page_folio(locked_page), start, end, NULL, + false, true); ASSERT(ret != 1); return ret; } @@ -2314,8 +2311,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ret = run_delalloc_cow(inode, page_folio(locked_page), start, end, wbc, true); else - ret = cow_file_range(inode, locked_page, start, end, NULL, - false, false); + ret = cow_file_range(inode, page_folio(locked_page), start, end, + NULL, false, false); out: if (ret < 0) From 39bbc56a9cb135a32d29ea534a9f219c4c406ea7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:40:34 -0400 Subject: [PATCH 032/110] btrfs: convert fallback_to_cow() to take a folio With this we can pass the folio directly into cow_file_range(). Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6047fd35fe43..534b1dec3752 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1763,8 +1763,9 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, return 1; } -static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end) +static int fallback_to_cow(struct btrfs_inode *inode, + struct folio *locked_folio, const u64 start, + const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); @@ -1833,8 +1834,8 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ - ret = cow_file_range(inode, page_folio(locked_page), start, end, NULL, - false, true); + ret = cow_file_range(inode, locked_folio, start, end, NULL, false, + true); ASSERT(ret != 1); return ret; } @@ -2151,7 +2152,7 @@ must_cow: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, + ret = fallback_to_cow(inode, page_folio(locked_page), cow_start, found_key.offset - 1); cow_start = (u64)-1; if (ret) { @@ -2230,7 +2231,8 @@ must_cow: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end); + ret = fallback_to_cow(inode, page_folio(locked_page), cow_start, + end); cow_start = (u64)-1; if (ret) goto error; From 42a5947b1c21d2cd156607058f0b844012ac7b6e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:42:23 -0400 Subject: [PATCH 033/110] btrfs: convert run_delalloc_nocow() to take a folio Now all of the functions that use locked_page in run_delalloc_nocow take a folio, update it to take a folio and update the caller. 
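The conversions in this and the surrounding patches are mechanical, so for
reference, the substitutions they apply (informative rather than
exhaustive; all taken from the diffs in this series):

	/*
	 * page-era expression   ->  folio-era equivalent
	 *
	 * page_offset(page)     ->  folio_pos(folio)
	 * PAGE_SIZE             ->  folio_size(folio)
	 * offset_in_page(pos)   ->  offset_in_folio(folio, pos)
	 * lock_page(page)       ->  folio_lock(folio)
	 * unlock_page(page)     ->  folio_unlock(folio)
	 * put_page(page)        ->  folio_put(folio)
	 * PageDirty(page)       ->  folio_test_dirty(folio)
	 * PageWorkingset(page)  ->  folio_test_workingset(folio)
	 */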
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 534b1dec3752..9fae22af047e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1989,7 +1989,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, - struct page *locked_page, + struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -2152,8 +2152,8 @@ must_cow: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, page_folio(locked_page), - cow_start, found_key.offset - 1); + ret = fallback_to_cow(inode, locked_folio, cow_start, + found_key.offset - 1); cow_start = (u64)-1; if (ret) { btrfs_dec_nocow_writers(nocow_bg); @@ -2208,8 +2208,7 @@ must_cow: btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - page_folio(locked_page), - &cached_state, + locked_folio, &cached_state, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2231,8 +2230,7 @@ must_cow: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, page_folio(locked_page), cow_start, - end); + ret = fallback_to_cow(inode, locked_folio, cow_start, end); cow_start = (u64)-1; if (ret) goto error; @@ -2259,7 +2257,7 @@ error: lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - page_folio(locked_page), &cached, + locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -2300,7 +2298,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, locked_page, start, end); + ret = run_delalloc_nocow(inode, page_folio(locked_page), start, + end); goto out; } From b38ec94ab95b6aa0c6d636ff264a3b150a32c8ca Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:46:01 -0400 Subject: [PATCH 034/110] btrfs: convert btrfs_cleanup_ordered_extents() to use folios We walk through pages in this function and clear ordered, and the function for this uses folios. Update the function to use a folio for this whole operation. 
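One behavioral detail in the diff below: find_get_page() returns NULL on a
page cache miss, while __filemap_get_folio() returns an ERR_PTR() (-ENOENT
on a plain miss), so the check changes shape. A minimal sketch of the
lookup-only call, under the same assumptions as the patch (zero fgp_flags
and gfp):

	folio = __filemap_get_folio(mapping, index, 0, 0);
	if (IS_ERR(folio))
		continue;	/* nothing cached at this index */
	/* ... inspect the folio ... */
	folio_put(folio);	/* drop the reference the lookup took */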
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9fae22af047e..d3345c323ba5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -399,7 +399,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 	unsigned long index = offset >> PAGE_SHIFT;
 	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
 	u64 page_start = 0, page_end = 0;
-	struct page *page;
+	struct folio *folio;
 
 	if (locked_page) {
 		page_start = page_offset(locked_page);
@@ -421,9 +421,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 			index++;
 			continue;
 		}
-		page = find_get_page(inode->vfs_inode.i_mapping, index);
+		folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
 		index++;
-		if (!page)
+		if (IS_ERR(folio))
 			continue;
 
 		/*
@@ -431,9 +431,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 		 * range, then btrfs_mark_ordered_io_finished() will handle
 		 * the ordered extent accounting for the range.
 		 */
-		btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
-						page_folio(page), offset, bytes);
-		put_page(page);
+		btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
+						offset, bytes);
+		folio_put(folio);
 	}
 
 	if (locked_page) {

From 94cea66d1c742585a9d6ffc7345d816e6a7e1f1b Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Wed, 24 Jul 2024 16:49:54 -0400
Subject: [PATCH 035/110] btrfs: convert btrfs_cleanup_ordered_extents() to take a folio

Now that btrfs_cleanup_ordered_extents() is operating mostly with folios,
update it to use a folio instead of a page, and then update the function
and the callers as appropriate.

Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 26 ++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3345c323ba5..15b3e368ce7f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -393,7 +393,7 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
  * extent (btrfs_finish_ordered_io()).
  */
 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
-						 struct page *locked_page,
+						 struct folio *locked_folio,
 						 u64 offset, u64 bytes)
 {
 	unsigned long index = offset >> PAGE_SHIFT;
@@ -401,9 +401,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 	u64 page_start = 0, page_end = 0;
 	struct folio *folio;
 
-	if (locked_page) {
-		page_start = page_offset(locked_page);
-		page_end = page_start + PAGE_SIZE - 1;
+	if (locked_folio) {
+		page_start = folio_pos(locked_folio);
+		page_end = page_start + folio_size(locked_folio) - 1;
 	}
 
 	while (index <= end_index) {
@@ -417,7 +417,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 		 * btrfs_mark_ordered_io_finished() would skip the accounting
 		 * for the page range, and the ordered extent will never finish.
*/ - if (locked_page && index == (page_start >> PAGE_SHIFT)) { + if (locked_folio && index == (page_start >> PAGE_SHIFT)) { index++; continue; } @@ -436,9 +436,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - if (locked_page) { + if (locked_folio) { /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + PAGE_SIZE) + if (bytes + offset <= page_start + folio_size(locked_folio)) return; /* * In case this page belongs to the delalloc range being @@ -447,8 +447,9 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + bytes = offset + bytes - folio_pos(locked_folio) - + folio_size(locked_folio); + offset = folio_pos(locked_folio) + folio_size(locked_folio); } } @@ -1138,7 +1139,8 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, page_folio(locked_page), + start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); @@ -2317,8 +2319,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_page, start, - end - start + 1); + btrfs_cleanup_ordered_extents(inode, page_folio(locked_page), + start, end - start + 1); return ret; } From d9c750272d94792266b3f0c816d3a927e964bc78 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:52:57 -0400 Subject: [PATCH 036/110] btrfs: convert run_delalloc_compressed() to take a folio This just passes the page into the compressed machinery to keep track of the locked page. Update this to take a folio and convert it to a page where appropriate. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 15b3e368ce7f..3ee2c863adee 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1653,7 +1653,7 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct page *locked_page, u64 start, + struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1693,15 +1693,16 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, INIT_LIST_HEAD(&async_chunk[i].extents); /* - * The locked_page comes all the way from writepage and its - * the original page we were actually given. As we spread + * The locked_folio comes all the way from writepage and its + * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk - * structs, only the first struct needs a pointer to locked_page + * structs, only the first struct needs a pointer to + * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ - if (locked_page) { + if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. 
We want all of them to @@ -1711,10 +1712,10 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, locked_page, + wbc_account_cgroup_owner(wbc, &locked_folio->page, cur_end - start); - async_chunk[i].locked_page = locked_page; - locked_page = NULL; + async_chunk[i].locked_page = &locked_folio->page; + locked_folio = NULL; } else { async_chunk[i].locked_page = NULL; } @@ -2307,7 +2308,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, locked_page, start, end, wbc)) + run_delalloc_compressed(inode, page_folio(locked_page), start, end, + wbc)) return 1; if (zoned) From 2609c9289f423e3d2d4044ddace4a0fb1939e9c3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 16:56:32 -0400 Subject: [PATCH 037/110] btrfs: convert btrfs_run_delalloc_range() to take a folio Now that every function that btrfs_run_delalloc_range calls takes a folio, update it to take a folio and update the callers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 26 ++++++++++++-------------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 3056c8aed8ef..5599b458a9a9 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -596,7 +596,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9ae17c9fd89b..5ff38e3f28e6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1254,7 +1254,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, if (ret >= 0) { /* No errors hit so far, run the current delalloc range. */ - ret = btrfs_run_delalloc_range(inode, &folio->page, + ret = btrfs_run_delalloc_range(inode, folio, found_start, found_start + found_len - 1, wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3ee2c863adee..e189dc9b6a3b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2287,42 +2287,40 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* - * The range must cover part of the @locked_page, or a return of 1 + * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. 
*/ - ASSERT(!(end <= page_offset(locked_page) || - start >= page_offset(locked_page) + PAGE_SIZE)); + ASSERT(!(end <= folio_pos(locked_folio) || + start >= folio_pos(locked_folio) + folio_size(locked_folio))); if (should_nocow(inode, start, end)) { - ret = run_delalloc_nocow(inode, page_folio(locked_page), start, - end); + ret = run_delalloc_nocow(inode, locked_folio, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, page_folio(locked_page), start, end, - wbc)) + run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) - ret = run_delalloc_cow(inode, page_folio(locked_page), start, - end, wbc, true); + ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, + true); else - ret = cow_file_range(inode, page_folio(locked_page), start, end, - NULL, false, false); + ret = cow_file_range(inode, locked_folio, start, end, NULL, + false, false); out: if (ret < 0) - btrfs_cleanup_ordered_extents(inode, page_folio(locked_page), - start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, locked_folio, start, + end - start + 1); return ret; } From 3ed984b5d0cccdfe273e29bd19e588a704bc4b93 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:09:33 -0400 Subject: [PATCH 038/110] btrfs: convert struct async_chunk to hold a folio Instead of passing in the page for ->locked_page, make it hold a locked_folio and then update the users of async_chunk to act accordingly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e189dc9b6a3b..ceb7144ed0de 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -762,7 +762,7 @@ struct async_extent { struct async_chunk { struct btrfs_inode *inode; - struct page *locked_page; + struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; @@ -1167,7 +1167,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; struct btrfs_key ins; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; @@ -1178,19 +1178,20 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, kthread_associate_blkcg(async_chunk->blkcg_css); /* - * If async_chunk->locked_page is in the async_extent range, we need to + * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. 
*/ - if (async_chunk->locked_page) { - u64 locked_page_start = page_offset(async_chunk->locked_page); - u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; + if (async_chunk->locked_folio) { + u64 locked_folio_start = folio_pos(async_chunk->locked_folio); + u64 locked_folio_end = locked_folio_start + + folio_size(async_chunk->locked_folio) - 1; - if (!(start >= locked_page_end || end <= locked_page_start)) - locked_page = async_chunk->locked_page; + if (!(start >= locked_folio_end || end <= locked_folio_start)) + locked_folio = async_chunk->locked_folio; } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, &locked_folio->page); goto done; } @@ -1205,7 +1206,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, locked_page); + submit_uncompressed_range(inode, async_extent, + &locked_folio->page); goto done; } @@ -1714,10 +1716,10 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, */ wbc_account_cgroup_owner(wbc, &locked_folio->page, cur_end - start); - async_chunk[i].locked_page = &locked_folio->page; + async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; } else { - async_chunk[i].locked_page = NULL; + async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { From 0d1170681098c4747dd15d44741dc10e83229a58 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:13:17 -0400 Subject: [PATCH 039/110] btrfs: convert submit_uncompressed_range() to take a folio This mostly uses folios already, update it to take a folio and update the rest of the function to use the folio instead of the page. 
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ceb7144ed0de..b6baa78e6573 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1122,7 +1122,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, - struct page *locked_page) + struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1135,23 +1135,22 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = run_delalloc_cow(inode, page_folio(locked_page), start, end, + ret = run_delalloc_cow(inode, locked_folio, start, end, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, page_folio(locked_page), + btrfs_cleanup_ordered_extents(inode, locked_folio, start, end - start + 1); - if (locked_page) { - const u64 page_start = page_offset(locked_page); + if (locked_folio) { + const u64 page_start = folio_pos(locked_folio); - set_page_writeback(locked_page); - end_page_writeback(locked_page); - btrfs_mark_ordered_io_finished(inode, - page_folio(locked_page), + folio_start_writeback(locked_folio); + folio_end_writeback(locked_folio); + btrfs_mark_ordered_io_finished(inode, locked_folio, page_start, PAGE_SIZE, !ret); - mapping_set_error(locked_page->mapping, ret); - unlock_page(locked_page); + mapping_set_error(locked_folio->mapping, ret); + folio_unlock(locked_folio); } } } @@ -1191,7 +1190,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { - submit_uncompressed_range(inode, async_extent, &locked_folio->page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } @@ -1206,8 +1205,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ - submit_uncompressed_range(inode, async_extent, - &locked_folio->page); + submit_uncompressed_range(inode, async_extent, locked_folio); goto done; } From 7d003cc2b3ef45e3ad78ec48ddeaaaae19f734cc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:21:25 -0400 Subject: [PATCH 040/110] btrfs: convert btrfs_writepage_fixup_worker() to use a folio This function heavily messes with pages, instead update it to use a folio. 
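Condensed to a sketch (error handling, the folio state checks and the reservation release are elided, so this is the shape of the worker, not its full logic), the retry pattern the worker follows is:

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
again:
	folio_lock(folio);
	/* ... bail out here if the folio was already dealt with ... */
	lock_extent(&inode->io_tree, start, end, &cached_state);
	ordered = btrfs_lookup_ordered_range(inode, start, len);
	if (ordered) {
		/* Raced with an ordered extent: wait for it and retry. */
		unlock_extent(&inode->io_tree, start, end, &cached_state);
		folio_unlock(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}
	ret = btrfs_set_extent_delalloc(inode, start, end, 0, &cached_state);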
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 54 +++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b6baa78e6573..da9b31a6cc14 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2708,49 +2708,51 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct page *page = fixup->page; + struct folio *folio = page_folio(page); struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 page_start = page_offset(page); - u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 page_start = folio_pos(folio); + u64 page_end = folio_pos(folio) + folio_size(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before - * we take the page lock. + * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + folio_size(folio)); again: - lock_page(page); + folio_lock(folio); /* - * Before we queued this fixup, we took a reference on the page. - * page->mapping may go NULL, but it shouldn't be moved to a different + * Before we queued this fixup, we took a reference on the folio. + * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ - if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + if (!folio->mapping || !folio_test_dirty(folio) || + !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * - * 1) We got here and our page had already been dealt with and + * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. - * 2) Our page was already dealt with, but we happened to get an + * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but - * because the page was already dealt with we don't want to - * mark the page with an error, so make sure we're resetting + * because the folio was already dealt with we don't want to + * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC - * when the page was already properly dealt with. + * when the folio was already properly dealt with. */ if (!ret) { - btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE, + page_start, folio_size(folio), true); } ret = 0; @@ -2758,7 +2760,7 @@ again: } /* - * We can't mess with the page state unless it is locked, so now that + * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) @@ -2767,14 +2769,14 @@ again: lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? 
We're done */ - if (PageOrdered(page)) + if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); - unlock_page(page); + folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; @@ -2792,7 +2794,7 @@ again: * * The page was dirty when we started, nothing should have cleaned it. */ - BUG_ON(!PageDirty(page)); + BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); @@ -2806,14 +2808,14 @@ out_page: * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ - mapping_set_error(page->mapping, ret); - btrfs_mark_ordered_io_finished(inode, page_folio(page), - page_start, PAGE_SIZE, !ret); - clear_page_dirty_for_io(page); + mapping_set_error(folio->mapping, ret); + btrfs_mark_ordered_io_finished(inode, folio, page_start, + folio_size(folio), !ret); + folio_clear_dirty_for_io(folio); } - btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); - unlock_page(page); - put_page(page); + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + folio_unlock(folio); + folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* From d71b53c3cb0ad0df8c0dfc1ea9a6507e010794da Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:26:31 -0400 Subject: [PATCH 041/110] btrfs: convert btrfs_writepage_cow_fixup() to use folio Instead of a page, use a folio for btrfs_writepage_cow_fixup. We already have a folio at the only caller, and the fixup worker uses folios. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 31 ++++++++++++++++--------------- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5599b458a9a9..fc60c0cde479 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -598,7 +598,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); +int btrfs_writepage_cow_fixup(struct folio *folio); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5ff38e3f28e6..6ba8867d2a1f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1410,7 +1410,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, ASSERT(start >= folio_pos(folio) && start + len <= folio_pos(folio) + folio_size(folio)); - ret = btrfs_writepage_cow_fixup(&folio->page); + ret = btrfs_writepage_cow_fixup(folio); if (ret) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da9b31a6cc14..e81b221d33a7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2828,33 +2828,34 @@ out_page: /* * There are a few paths in the higher layers of the kernel that directly - * set the page dirty bit without asking the filesystem if it is a + * set the folio dirty bit without asking the 
filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set - * the delalloc bit and make it safe to write the page. + * the delalloc bit and make it safe to write the folio. */ -int btrfs_writepage_cow_fixup(struct page *page) +int btrfs_writepage_cow_fixup(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; - /* This page has ordered extent covering it already */ - if (PageOrdered(page)) + /* This folio has ordered extent covering it already */ + if (folio_test_ordered(folio)) return 0; /* - * PageChecked is set below when we create a fixup worker for this page, - * don't try to create another one if we're already PageChecked() + * folio_checked is set below when we create a fixup worker for this + * folio, don't try to create another one if we're already + * folio_test_checked. * - * The extent_io writepage code will redirty the page if we send back + * The extent_io writepage code will redirty the folio if we send back * EAGAIN. */ - if (PageChecked(page)) + if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); @@ -2864,14 +2865,14 @@ int btrfs_writepage_cow_fixup(struct page *page) /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation - * takes place outside of the page lock, and we can't trust - * page->mapping outside of the page lock. + * takes place outside of the folio lock, and we can't trust + * page->mapping outside of the folio lock. */ ihold(inode); - btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); - get_page(page); + btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); + folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = page; + fixup->page = &folio->page; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); From 1b5125bbd42541d4fb954e9636284d0387d2b7f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:28:05 -0400 Subject: [PATCH 042/110] btrfs: convert struct btrfs_writepage_fixup to use a folio Now the fixup creator and consumer use folios, change this to use a folio as well.
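The folio reference counting across the deferred work, reduced to a sketch containing only the refcount-relevant lines:

	/* Queue side: pin the folio so it outlives the submitting context. */
	folio_get(folio);
	fixup->folio = folio;
	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);

	/* Worker side (btrfs_writepage_fixup_worker), once the fixup is done: */
	folio_unlock(folio);
	folio_put(folio);
	kfree(fixup);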
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e81b221d33a7..8ca07ca7dcd0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2695,7 +2695,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { - struct page *page; + struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2707,8 +2707,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page = fixup->page; - struct folio *folio = page_folio(page); + struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); @@ -2872,7 +2871,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); - fixup->page = &folio->page; + fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); From 752965824b6d7e1d7e144985dbd48e68f742ae35 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 17:38:46 -0400 Subject: [PATCH 043/110] btrfs: convert uncompress_inline() to take a folio Update uncompress_inline to take a folio and update it's usage accordingly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ca07ca7dcd0..29b5c257f95a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6707,7 +6707,7 @@ static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, } static noinline int uncompress_inline(struct btrfs_path *path, - struct page *page, + struct folio *folio, struct btrfs_file_extent_item *item) { int ret; @@ -6729,7 +6729,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, &folio->page, 0, inline_size, + max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6740,7 +6741,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, */ if (max_size < PAGE_SIZE) - memzero_page(page, max_size, PAGE_SIZE - max_size); + folio_zero_range(folio, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } @@ -6760,7 +6761,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page, fi); + return uncompress_inline(path, page_folio(page), fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); From 220e77c412d342c5961b6d5440935c2e466724b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 19:25:10 -0400 Subject: [PATCH 044/110] btrfs: 
convert read_inline_extent() to use a folio Instead of using a page, use a folio instead, take a folio as an argument, and update the callers appropriately. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29b5c257f95a..eb3c8ccf7337 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6747,30 +6747,30 @@ static noinline int uncompress_inline(struct btrfs_path *path, } static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, - struct page *page) + struct folio *folio) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; - if (!page || PageUptodate(page)) + if (!folio || folio_test_uptodate(folio)) return 0; - ASSERT(page_offset(page) == 0); + ASSERT(folio_pos(folio) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) - return uncompress_inline(path, page_folio(page), fi); + return uncompress_inline(path, folio, fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) - memzero_page(page, copy_size, PAGE_SIZE - copy_size); + folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); return 0; } @@ -6945,7 +6945,7 @@ next: ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page); + ret = read_inline_extent(inode, path, page_folio(page)); if (ret < 0) goto out; goto insert; From dce9ef941205db02c8e7b01e0091f8115d024be1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 19:26:58 -0400 Subject: [PATCH 045/110] btrfs: convert btrfs_get_extent() to take a folio We only pass this into read_inline_extent, change it to take a folio and update the callers. 
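One detail of the read_inline_extent()/uncompress_inline() conversions above worth noting: kmap_local_folio() takes a byte offset into the folio rather than a page pointer, which is why kmap_local_page(page) becomes kmap_local_folio(folio, 0). A minimal sketch of the idiom (src_buf and copy_size are illustrative names, not from the patch):

	void *kaddr;

	kaddr = kmap_local_folio(folio, 0);	/* map the folio from byte 0 */
	memcpy(kaddr, src_buf, copy_size);	/* access only while mapped */
	kunmap_local(kaddr);			/* local mappings unmap in LIFO order */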
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index fc60c0cde479..2d7f8da54d8a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -578,7 +578,7 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct btrfs_path *path); struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len); + struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6ba8867d2a1f..f4eb44298bd8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -987,7 +987,7 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page_folio(page), start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eb3c8ccf7337..05e74ef06896 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6792,7 +6792,7 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, u64 start, u64 len) + struct folio *folio, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; @@ -6815,7 +6815,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); - else if (em->disk_bytenr == EXTENT_MAP_INLINE && page) + else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) free_extent_map(em); else goto out; @@ -6945,7 +6945,7 @@ next: ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); - ret = read_inline_extent(inode, path, page_folio(page)); + ret = read_inline_extent(inode, path, folio); if (ret < 0) goto out; goto insert; From 7ed07d16624a6452ad048f9e1186eafd600582e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 19:28:26 -0400 Subject: [PATCH 046/110] btrfs: convert __get_extent_map() to take a folio Now that btrfs_get_extent takes a folio, update __get_extent_map to take a folio as well. 
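For context, the one-element caching scheme __get_extent_map() implements, sketched and simplified from the function this patch touches:

	if (em_cached && *em_cached) {
		em = *em_cached;
		if (extent_map_in_tree(em) && start >= em->start &&
		    start < extent_map_end(em)) {
			refcount_inc(&em->refs);
			return em;		/* cache hit: reuse the last map */
		}
		free_extent_map(em);
		*em_cached = NULL;		/* stale entry: drop it */
	}
	em = btrfs_get_extent(BTRFS_I(inode), folio, start, len);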
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f4eb44298bd8..782a38370d03 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -968,8 +968,9 @@ void clear_page_extent_mapped(struct page *page) folio_detach_private(folio); } -static struct extent_map *__get_extent_map(struct inode *inode, struct page *page, - u64 start, u64 len, struct extent_map **em_cached) +static struct extent_map *__get_extent_map(struct inode *inode, + struct folio *folio, u64 start, + u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -987,7 +988,7 @@ static struct extent_map *__get_extent_map(struct inode *inode, struct page *pag *em_cached = NULL; } - em = btrfs_get_extent(BTRFS_I(inode), page_folio(page), start, len); + em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -1050,8 +1051,8 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, end_folio_read(folio, true, cur, iosize); break; } - em = __get_extent_map(inode, folio_page(folio, 0), cur, - end - cur + 1, em_cached); + em = __get_extent_map(inode, folio, cur, end - cur + 1, + em_cached); if (IS_ERR(em)) { unlock_extent(tree, cur, end, NULL); end_folio_read(folio, false, cur, end + 1 - cur); From 1a48259d9b6a4331a932700cc884681433b26244 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:17:26 -0400 Subject: [PATCH 047/110] btrfs: convert find_next_dirty_byte() to take a folio We already use a folio some in this function, replace all page usage with the folio and update the function to take the folio as an argument. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 782a38370d03..e56b62746a15 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1348,9 +1348,8 @@ out: * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. */ static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, - struct page *page, u64 *start, u64 *end) + struct folio *folio, u64 *start, u64 *end) { - struct folio *folio = page_folio(page); struct btrfs_subpage *subpage = folio_get_private(folio); struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; @@ -1363,14 +1362,15 @@ static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. 
*/ - if (!btrfs_is_subpage(fs_info, page->mapping)) { - *start = page_offset(page); - *end = page_offset(page) + PAGE_SIZE; + if (!btrfs_is_subpage(fs_info, folio->mapping)) { + *start = folio_pos(folio); + *end = folio_pos(folio) + folio_size(folio); return; } range_start_bit = spi->dirty_offset + - (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + (offset_in_folio(folio, orig_start) >> + fs_info->sectorsize_bits); /* We should have the page locked, but just in case */ spin_lock_irqsave(&subpage->lock, flags); @@ -1381,8 +1381,8 @@ static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, range_start_bit -= spi->dirty_offset; range_end_bit -= spi->dirty_offset; - *start = page_offset(page) + range_start_bit * fs_info->sectorsize; - *end = page_offset(page) + range_end_bit * fs_info->sectorsize; + *start = folio_pos(folio) + range_start_bit * fs_info->sectorsize; + *end = folio_pos(folio) + range_end_bit * fs_info->sectorsize; } /* @@ -1443,7 +1443,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, break; } - find_next_dirty_byte(fs_info, &folio->page, &dirty_range_start, + find_next_dirty_byte(fs_info, folio, &dirty_range_start, &dirty_range_end); if (cur < dirty_range_start) { cur = dirty_range_start; From dfc9e3017aa71211a11d0b479552af6ee3f9d9b2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:20:24 -0400 Subject: [PATCH 048/110] btrfs: convert wait_subpage_spinlock() to only use a folio Currently this already uses a folio for most things, update it to take a folio and update all the page usage with the corresponding folio usage. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 05e74ef06896..0e5db913d6bb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7187,13 +7187,12 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ -static void wait_subpage_spinlock(struct page *page) +static void wait_subpage_spinlock(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page->mapping)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7223,7 +7222,7 @@ static int btrfs_launder_folio(struct folio *folio) static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (try_release_extent_mapping(&folio->page, gfp_flags)) { - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); clear_page_extent_mapped(&folio->page); return true; } @@ -7284,7 +7283,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * do double ordered extent accounting on the same folio. 
*/ folio_wait_writeback(folio); - wait_subpage_spinlock(&folio->page); + wait_subpage_spinlock(folio); /* * For subpage case, we have call sites like From 1bbf3a3aea3b5f40ad25edfe11bf652fed1b730d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:22:32 -0400 Subject: [PATCH 049/110] btrfs: convert btrfs_set_range_writeback() to use a folio We already use a lot of functions here that use folios, update the function to use __filemap_get_folio instead of find_get_page and then use the folio directly. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0e5db913d6bb..aece69dc41ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8958,19 +8958,19 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; u32 len; ASSERT(end + 1 - start <= U32_MAX); len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->vfs_inode.i_mapping, index); - ASSERT(page); /* Pages should be in the extent_io_tree */ + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); + ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */ /* This is for data, which doesn't yet support larger folio. */ - ASSERT(folio_order(page_folio(page)) == 0); - btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); - put_page(page); + ASSERT(folio_order(folio) == 0); + btrfs_folio_set_writeback(fs_info, folio, start, len); + folio_put(folio); index++; } } From c86d3aac8146ea5df911a037b9cf32881783d4e9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:37:20 -0400 Subject: [PATCH 050/110] btrfs: convert insert_inline_extent() to use a folio We only use a page to copy in the data for the inline extent. Use a folio for this instead. 
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aece69dc41ec..cd9290f86a4c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -495,7 +495,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct page *page = NULL; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; @@ -555,12 +554,16 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { - page = find_get_page(inode->vfs_inode.i_mapping, 0); + struct folio *folio; + + folio = __filemap_get_folio(inode->vfs_inode.i_mapping, + 0, 0, 0); + ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); - put_page(page); + folio_put(folio); } btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); From 5fe191244955f334e35bc4ebaadf3300f22b6b41 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 24 Jul 2024 20:39:35 -0400 Subject: [PATCH 051/110] btrfs: convert extent_range_clear_dirty_for_io() to use a folio Instead of getting a page and using that to clear dirty for io, use the folio helper and use the appropriate folio functions. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cd9290f86a4c..a9656e5529fb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -877,19 +877,19 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int ret = 0; for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - page = find_get_page(inode->i_mapping, index); - if (unlikely(!page)) { + folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); + if (IS_ERR(folio)) { if (!ret) - ret = -ENOENT; + ret = PTR_ERR(folio); continue; } - clear_page_dirty_for_io(page); - put_page(page); + folio_clear_dirty_for_io(folio); + folio_put(folio); } return ret; } From b79f1c2caadc5c6251241977c7987fefdeadc2d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jul 2024 10:46:01 +0100 Subject: [PATCH 052/110] btrfs: reschedule when updating chunk maps at the end of a device replace At the end of a device replace we must go over all the chunk maps and update their stripes to point to the target device instead of the source device. We iterate over the chunk maps while holding a write lock and we never reschedule, which can result in monopolizing a CPU for too long and blocking readers for too long (it's a rw lock, non-blocking). So improve on this by rescheduling if necessary. This is safe because at this point we are holding the chunk mutex, which means no new chunks can be allocated and therefore we don't risk missing a new chunk map that covers a range behind the last one we processed before rescheduling. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f638c458d285..20cf5e95f2bc 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -827,6 +827,14 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( u64 start = 0; int i; + /* + * The chunk mutex must be held so that no new chunks can be created + * while we are updating existing chunks. This guarantees we don't miss + * any new chunk that gets created for a range that falls before the + * range of the last chunk we processed. + */ + lockdep_assert_held(&fs_info->chunk_mutex); + write_lock(&fs_info->mapping_tree_lock); do { struct btrfs_chunk_map *map; @@ -839,6 +847,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( map->stripes[i].dev = tgtdev; start = map->start + map->chunk_len; btrfs_free_chunk_map(map); + cond_resched_rwlock_write(&fs_info->mapping_tree_lock); } while (start); write_unlock(&fs_info->mapping_tree_lock); } From 68a505bb87f948f72e5d230dfd7b40debdb195ed Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 25 Jul 2024 11:48:10 +0100 Subject: [PATCH 053/110] btrfs: more efficient chunk map iteration when device replace finishes When iterating the chunk maps when a device replace finishes we are doing a full rbtree search for each chunk map, which is not the most efficient thing to do, wasting CPU time. As we are holding a write lock on the tree during the whole iteration, we can simply start from the first node in the tree and then move to the next chunk map by doing a rb_next() call - the only exception is when we need to reschedule, in which case we have to do a full rbtree search since we dropped the write lock and the tree may have changed (chunk maps may have been removed and the tree got rebalanced). So just do that. 
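The subtlety this relies on: cond_resched_rwlock_write() returns nonzero only when it actually dropped the lock and rescheduled, in which case any cached node pointer may be stale. The resulting iteration pattern, sketched (lookup_from() is a stand-in for the btrfs_find_chunk_map_nolock() re-lookup, details elided):

	write_lock(&fs_info->mapping_tree_lock);
	node = rb_first_cached(&fs_info->mapping_tree);
	while (node) {
		struct rb_node *next = rb_next(node);

		/* ... update the stripes of the chunk map at 'node' ... */

		if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) {
			/* Lock was dropped: 'next' may be stale, redo the lookup. */
			node = lookup_from(fs_info, next_start);
		} else {
			node = next;
		}
	}
	write_unlock(&fs_info->mapping_tree_lock);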
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 20cf5e95f2bc..83d5cdd77f29 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -824,8 +824,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - u64 start = 0; - int i; + struct rb_node *node; /* * The chunk mutex must be held so that no new chunks can be created @@ -836,19 +835,34 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( lockdep_assert_held(&fs_info->chunk_mutex); write_lock(&fs_info->mapping_tree_lock); - do { + node = rb_first_cached(&fs_info->mapping_tree); + while (node) { + struct rb_node *next = rb_next(node); struct btrfs_chunk_map *map; + u64 next_start; - map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX); - if (!map) - break; - for (i = 0; i < map->num_stripes; i++) + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + next_start = map->start + map->chunk_len; + + for (int i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; - start = map->start + map->chunk_len; - btrfs_free_chunk_map(map); - cond_resched_rwlock_write(&fs_info->mapping_tree_lock); - } while (start); + + if (cond_resched_rwlock_write(&fs_info->mapping_tree_lock)) { + map = btrfs_find_chunk_map_nolock(fs_info, next_start, U64_MAX); + if (!map) + break; + node = &map->rb_node; + /* + * Drop the lookup reference since we are holding the + * lock in write mode and no one can remove the chunk + * map from the tree and drop its tree reference. + */ + btrfs_free_chunk_map(map); + } else { + node = next; + } + } write_unlock(&fs_info->mapping_tree_lock); } From f8e9f4a76df65222764c947a1b166ceebb1256dd Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 31 Jul 2024 12:41:06 -0700 Subject: [PATCH 054/110] btrfs: add comment about locking in cow_file_range_inline() Add a comment to document the complicated locked_page unlock logic in cow_file_range_inline. The specifically tricky part is that a caller just up the stack converts ret == 0 to ret == 1 and then another caller far up the callstack handles ret == 1 as a success, AND returns without cleanup in that case, both of which "feel" unnatural and led to the original bug. Try to document that somewhat specific callstack logic here to explain the weird un-setting of locked_folio on success. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9656e5529fb..d25aeb844a64 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -744,6 +744,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, return ret; } + /* + * In the successful case (ret == 0 here), cow_file_range will return 1. + * + * Quite a bit further up the callstack in __extent_writepage, ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * In the failure case, the locked_folio does get unlocked by + * btrfs_folio_end_all_writers, which asserts that it is still locked + * at that point, so we must *not* unlock it here. 
+ * + * The other two callsites in compress_file_range do not have a + * locked_folio, so they are not relevant to this logic. + */ if (ret == 0) locked_folio = NULL; From f8428360c8f9eca3bf355cdf10b789f880d61b47 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:03 +0200 Subject: [PATCH 055/110] btrfs: don't dump stripe-tree on lookup error This just creates unnecessary noise and doesn't provide any insights into debugging RAID stripe-tree related issues. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index bd06ff795691..c1c74f310e8b 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -284,8 +284,6 @@ out: if (ret > 0) ret = -ENOENT; if (ret && ret != -EIO && !stripe->is_scrub) { - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) - btrfs_print_tree(leaf, 1); btrfs_err(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, From d6106f0dc502c8ec375d6612418f7aa0e3e7d2b7 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:04 +0200 Subject: [PATCH 056/110] btrfs: rename btrfs_io_stripe::is_scrub to rst_search_commit_root Rename 'btrfs_io_stripe::is_scrub' to 'rst_search_commit_root'. While 'is_scrub' describes the state of the io_stripe (it is a stripe submitted by scrub) it does not describe the purpose, namely looking at the commit root when searching RAID stripe-tree entries. Renaming the stripe to rst_search_commit_root describes this purpose. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- fs/btrfs/raid-stripe-tree.c | 4 ++-- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b4e31ae17cd9..36d0e52faeec 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -678,7 +678,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; - smap.is_scrub = !bbio->inode; + smap.rst_search_commit_root = !bbio->inode; btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index c1c74f310e8b..28a545367be7 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -210,7 +210,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, if (!path) return -ENOMEM; - if (stripe->is_scrub) { + if (stripe->rst_search_commit_root) { path->skip_locking = 1; path->search_commit_root = 1; } @@ -283,7 +283,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, out: if (ret > 0) ret = -ENOENT; - if (ret && ret != -EIO && !stripe->is_scrub) { + if (ret && ret != -EIO && !stripe->rst_search_commit_root) { btrfs_err(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0de9162ff481..b3afa6365823 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1694,7 +1694,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, (i << fs_info->sectorsize_bits); int err; - io_stripe.is_scrub = true; + 
io_stripe.rst_search_commit_root = true; stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits; /* * For RST cases, we need to manually split the bbio to diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c947187539dd..03d2d60afe0c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -444,7 +444,7 @@ struct btrfs_io_stripe { /* Block mapping. */ u64 physical; u64 length; - bool is_scrub; + bool rst_search_commit_root; /* For the endio handler. */ struct btrfs_io_context *bioc; }; From f4d39cf1cebfb83f76ffdd632958248aff364e33 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:05 +0200 Subject: [PATCH 057/110] btrfs: set search_commit_root on stripe io in case of relocation Set rst_search_commit_root in the btrfs_io_stripe we're passing to btrfs_map_block() in case we're doing data relocation. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/bio.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 36d0e52faeec..f6cb58d7f16a 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -678,7 +678,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; - smap.rst_search_commit_root = !bbio->inode; + if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + smap.rst_search_commit_root = true; + else + smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, From 04915240e2c3a018e4c7f23418478d27226c8957 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:06 +0200 Subject: [PATCH 058/110] btrfs: don't readahead the relocation inode on RST On relocation we're doing readahead on the relocation inode, but if the filesystem is backed by a RAID stripe tree we can get ENOENT (e.g. due to preallocated extents not being mapped in the RST) from the lookup. But readahead doesn't handle the error and submits invalid reads to the device, causing an assertion in the scatter-gather list code: BTRFS info (device nvme1n1): balance: start -d -m -s BTRFS info (device nvme1n1): relocating block group 6480920576 flags data|raid0 BTRFS error (device nvme1n1): cannot find raid-stripe for logical [6481928192, 6481969152] devid 2, profile raid0 ------------[ cut here ]------------ kernel BUG at include/linux/scatterlist.h:115! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1012 Comm: btrfs Not tainted 6.10.0-rc7+ #567 RIP: 0010:__blk_rq_map_sg+0x339/0x4a0 RSP: 0018:ffffc90001a43820 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffea00045d4802 RDX: 0000000117520000 RSI: 0000000000000000 RDI: ffff8881027d1000 RBP: 0000000000003000 R08: ffffea00045d4902 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000001000 R12: ffff8881003d10b8 R13: ffffc90001a438f0 R14: 0000000000000000 R15: 0000000000003000 FS: 00007fcc048a6900(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000002cd11000 CR3: 00000001109ea001 CR4: 0000000000370eb0 Call Trace: ? __die_body.cold+0x14/0x25 ? die+0x2e/0x50 ? do_trap+0xca/0x110 ? do_error_trap+0x65/0x80 ? __blk_rq_map_sg+0x339/0x4a0 ? exc_invalid_op+0x50/0x70 ? __blk_rq_map_sg+0x339/0x4a0 ? asm_exc_invalid_op+0x1a/0x20 ? __blk_rq_map_sg+0x339/0x4a0 nvme_prep_rq.part.0+0x9d/0x770 nvme_queue_rq+0x7d/0x1e0 __blk_mq_issue_directly+0x2a/0x90 ? 
blk_mq_get_budget_and_tag+0x61/0x90 blk_mq_try_issue_list_directly+0x56/0xf0 blk_mq_flush_plug_list.part.0+0x52b/0x5d0 __blk_flush_plug+0xc6/0x110 blk_finish_plug+0x28/0x40 read_pages+0x160/0x1c0 page_cache_ra_unbounded+0x109/0x180 relocate_file_extent_cluster+0x611/0x6a0 ? btrfs_search_slot+0xba4/0xd20 ? balance_dirty_pages_ratelimited_flags+0x26/0xb00 relocate_data_extent.constprop.0+0x134/0x160 relocate_block_group+0x3f2/0x500 btrfs_relocate_block_group+0x250/0x430 btrfs_relocate_chunk+0x3f/0x130 btrfs_balance+0x71b/0xef0 ? kmalloc_trace_noprof+0x13b/0x280 btrfs_ioctl+0x2c2e/0x3030 ? kvfree_call_rcu+0x1e6/0x340 ? list_lru_add_obj+0x66/0x80 ? mntput_no_expire+0x3a/0x220 __x64_sys_ioctl+0x96/0xc0 do_syscall_64+0x54/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fcc04514f9b Code: Unable to access opcode bytes at 0x7fcc04514f71. RSP: 002b:00007ffeba923370 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fcc04514f9b RDX: 00007ffeba923460 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000013 R09: 0000000000000001 R10: 00007fcc043fbba8 R11: 0000000000000246 R12: 00007ffeba924fc5 R13: 00007ffeba923460 R14: 0000000000000002 R15: 00000000004d4bb0 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:__blk_rq_map_sg+0x339/0x4a0 RSP: 0018:ffffc90001a43820 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffea00045d4802 RDX: 0000000117520000 RSI: 0000000000000000 RDI: ffff8881027d1000 RBP: 0000000000003000 R08: ffffea00045d4902 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000001000 R12: ffff8881003d10b8 R13: ffffc90001a438f0 R14: 0000000000000000 R15: 0000000000003000 FS: 00007fcc048a6900(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fcc04514f71 CR3: 00000001109ea001 CR4: 0000000000370eb0 Kernel panic - not syncing: Fatal exception Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception ]--- So in case of a relocation on a RAID stripe-tree based file system, skip the readahead. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0533d0f82dc9..ea4ed85919ec 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -36,6 +36,7 @@ #include "relocation.h" #include "super.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" /* * Relocation overview @@ -2965,21 +2966,34 @@ static int relocate_one_folio(struct reloc_control *rc, u64 folio_end; u64 cur; int ret; + const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); ASSERT(index <= last_index); folio = filemap_lock_folio(inode->i_mapping, index); if (IS_ERR(folio)) { - page_cache_sync_readahead(inode->i_mapping, ra, NULL, - index, last_index + 1 - index); + + /* + * On relocation we're doing readahead on the relocation inode, + * but if the filesystem is backed by a RAID stripe tree we can + * get ENOENT (e.g. due to preallocated extents not being + * mapped in the RST) from the lookup. + * + * But readahead doesn't handle the error and submits invalid + * reads to the device, causing a assertion failures. 
+ */ + if (!use_rst) + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + index, last_index + 1 - index); folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); if (IS_ERR(folio)) return PTR_ERR(folio); } WARN_ON(folio_order(folio)); - if (folio_test_readahead(folio)) + if (folio_test_readahead(folio) && !use_rst) page_cache_async_readahead(inode->i_mapping, ra, NULL, folio, last_index + 1 - index); From 0c749585fc522b6cb32111abf2cd8f17cf30d3c5 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 Jul 2024 22:43:07 +0200 Subject: [PATCH 059/110] btrfs: change RST lookup error message level to debug Now that RAID stripe-tree lookup failures are not treated as a fatal issue any more, change the RAID stripe-tree lookup error message to debug level. Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid-stripe-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 28a545367be7..4c859b550f6c 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -284,7 +284,7 @@ out: if (ret > 0) ret = -ENOENT; if (ret && ret != -EIO && !stripe->rst_search_commit_root) { - btrfs_err(fs_info, + btrfs_debug(fs_info, "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", logical, logical + *length, stripe->dev->devid, btrfs_bg_type_to_raid_name(map_type)); From efffb803bf37d4514e025e3d59e067dabb59bcd1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 5 Aug 2024 15:02:54 +0930 Subject: [PATCH 060/110] btrfs: make btrfs_is_subpage() to return false directly for 4K page size Btrfs only supports sectorsize 4K, 8K, 16K, 32K, 64K for now, thus for systems with 4K page size, there is no way the fs is subpage (sectorsize < PAGE_SIZE). So here we define btrfs_is_subpage() different according to the PAGE_SIZE: - PAGE_SIZE > 4K We may hit real subpage cases, define btrfs_is_subpage() as a regular function and do the usual checks. - PAGE_SIZE == 4K (no smaller PAGE_SIZE support AFAIK) There is no way the fs is subpage, so just define btrfs_is_subpage() as an inline function which always return false. This saves about 7K bytes for x86_64 debug builds: text data bss dec hex filename Before: 1484452 168693 25776 1678921 199e49 fs/btrfs/btrfs.ko After: 1476605 168445 25776 1670826 197eaa fs/btrfs/btrfs.ko Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 2 ++ fs/btrfs/subpage.h | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 8ddd5fcbeb93..631d96f1e905 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,6 +64,7 @@ * This means a slightly higher tree locking latency. 
*/ +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) @@ -85,6 +86,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space return true; return false; } +#endif void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) { diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 249396e118d0..5532cc4fac50 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -5,6 +5,7 @@ #include <linux/spinlock.h> #include <linux/atomic.h> +#include <linux/sizes.h> struct address_space; struct folio; @@ -88,7 +89,15 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +#if PAGE_SIZE > SZ_4K bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#else +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct address_space *mapping) +{ + return false; +} +#endif void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); From 6d752cacae5eb8590c98866effaebf67410a9136 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 7 Aug 2024 17:13:58 +0100 Subject: [PATCH 061/110] btrfs: directly wake up cleaner kthread in the BTRFS_IOC_SYNC ioctl The BTRFS_IOC_SYNC ioctl wants to wake up the cleaner kthread so that it does any pending work (subvolume deletion, delayed iputs, etc), however it is waking up the transaction kthread, which in turn wakes up the cleaner. Since we don't have any transaction to commit, as any ongoing transaction was already committed when it called btrfs_sync_fs() and the goal is just to wake up the cleaner thread, directly wake up the cleaner instead of the transaction kthread. Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e0a664b8a46a..ee01cc828883 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4765,11 +4765,10 @@ long btrfs_ioctl(struct file *file, unsigned int return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* - * The transaction thread may want to do more work, - * namely it pokes the cleaner kthread that will start - * processing uncleaned subvols. + * There may be work for the cleaner kthread to do (subvolume + * deletion, delayed iputs, defrag inodes, etc), so wake it up. */ - wake_up_process(fs_info->transaction_kthread); + wake_up_process(fs_info->cleaner_kthread); return ret; } case BTRFS_IOC_START_SYNC: From 0ae653fbec2b9fbc72c65a0c99528990bfb2136d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 13 Aug 2024 13:36:40 +0200 Subject: [PATCH 062/110] btrfs: reduce chunk_map lookups in btrfs_map_block() Currently we're calling btrfs_num_copies() before btrfs_get_chunk_map() in btrfs_map_block(). But btrfs_num_copies() itself does a chunk map lookup to be able to calculate the number of copies. So split out the code getting the number of copies from btrfs_num_copies() into a helper called btrfs_chunk_map_num_copies() and directly call it from btrfs_map_block() and btrfs_num_copies(). This saves us one rbtree lookup per btrfs_map_block() invocation.
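To illustrate the shape of this refactor in isolation, here is a minimal userspace sketch (all names and types are hypothetical stand-ins, not btrfs code): the entry point performs the expensive lookup once and hands the resulting object to a helper, instead of the helper redoing the lookup on its own.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct btrfs_chunk_map. */
struct map {
	int mirrored;		/* stand-in for the RAID type flags */
	int num_stripes;
};

/* Stand-in for the rbtree lookup done by btrfs_get_chunk_map(). */
static struct map *lookup_map(unsigned long long logical)
{
	struct map *map = malloc(sizeof(*map));

	if (map) {
		map->mirrored = 1;
		map->num_stripes = 2;
	}
	return map;
}

/* Like btrfs_chunk_map_num_copies(): works on an already looked-up map. */
static int map_num_copies(const struct map *map)
{
	return map->mirrored ? map->num_stripes : 1;
}

/* Like btrfs_num_copies(): for callers that only have a logical address. */
static int num_copies(unsigned long long logical)
{
	struct map *map = lookup_map(logical);
	int ret;

	if (!map)
		return -1;
	ret = map_num_copies(map);	/* reuse the single lookup */
	free(map);
	return ret;
}

int main(void)
{
	printf("copies: %d\n", num_copies(0x1000));
	return 0;
}

A caller that already holds the map (as btrfs_map_block() does after its own btrfs_get_chunk_map() call) simply calls the helper directly and skips the second lookup entirely.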
Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 49 +++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e07452207426..8f340ad1d938 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5781,11 +5781,31 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) write_unlock(&fs_info->mapping_tree_lock); } +static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type); + + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + + /* + * There could be two corrupted data stripes, we need to loop retry in + * order to rebuild the correct data. + * + * Fail a stripe at a time on every retry except the stripe under + * reconstruction. + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return map->num_stripes; + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + return btrfs_raid_array[index].ncopies; +} + int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct btrfs_chunk_map *map; - enum btrfs_raid_types index; - int ret = 1; + int ret; map = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(map)) @@ -5797,22 +5817,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - index = btrfs_bg_flags_to_raid_index(map->type); - - /* Non-RAID56, use their ncopies from btrfs_raid_array. */ - if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) - ret = btrfs_raid_array[index].ncopies; - else if (map->type & BTRFS_BLOCK_GROUP_RAID5) - ret = 2; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - /* - * There could be two corrupted data stripes, we need - * to loop retry in order to rebuild the correct data. - * - * Fail a stripe at a time on every retry except the - * stripe under reconstruction. - */ - ret = map->num_stripes; + ret = btrfs_chunk_map_num_copies(map); btrfs_free_chunk_map(map); return ret; } @@ -6462,14 +6467,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom.stripe_index = 0; io_geom.op = op; - num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (io_geom.mirror_num > num_copies) - return -EINVAL; - map = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(map)) return PTR_ERR(map); + num_copies = btrfs_chunk_map_num_copies(map); + if (io_geom.mirror_num > num_copies) + return -EINVAL; + map_offset = logical - map->start; io_geom.raid56_full_stripe_start = (u64)-1; max_len = btrfs_max_io_len(map, map_offset, &io_geom); From 77b0b98bb743f5d04d8f995ba1936e1143689d4a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 30 Aug 2024 16:35:48 +0930 Subject: [PATCH 063/110] btrfs: subpage: fix the bitmap dump which can cause bitmap corruption In commit 75258f20fb70 ("btrfs: subpage: dump extra subpage bitmaps for debug") an internal macro GET_SUBPAGE_BITMAP() was introduced to grab the bitmap of each attribute. But that commit is using bitmap_cut() which will do a left shift of the larger bitmap, causing incorrect values. Thankfully this bitmap_cut() is only called for debug usage, and so far it's not yet causing problems. Fix it to use bitmap_read() to only grab the desired sub-bitmap.
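The difference can be modeled in userspace with a single 64-bit word (a rough sketch under the assumption that the whole bitmap fits in one long; bitmap_read() and bitmap_cut() themselves are kernel APIs from <linux/bitmap.h>): extracting a sub-bitmap needs a shift plus a mask, while a plain cut-and-shift leaks the neighboring sub-bitmaps into the result.

#include <stdio.h>
#include <stdint.h>

/* Model of bitmap_read() for nbits < 64: shift, then mask off the rest. */
static uint64_t read_bits(uint64_t map, unsigned int start, unsigned int nbits)
{
	return (map >> start) & ((1ULL << nbits) - 1);
}

/* Model of the buggy extraction: shifting without masking keeps the
 * higher sub-bitmaps in the result. */
static uint64_t shift_bits(uint64_t map, unsigned int start)
{
	return map >> start;
}

int main(void)
{
	/* Two 4-bit sub-bitmaps: "dirty" at bit 4, "writeback" at bit 8. */
	uint64_t map = (0xFULL << 8) | (0x5ULL << 4);

	printf("masked:   0x%llx\n", (unsigned long long)read_bits(map, 4, 4));	/* 0x5 */
	printf("unmasked: 0x%llx\n", (unsigned long long)shift_bits(map, 4));	/* 0xf5 */
	return 0;
}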
Fixes: 75258f20fb70 ("btrfs: subpage: dump extra subpage bitmaps for debug") CC: stable@vger.kernel.org # 6.6+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 631d96f1e905..f8795c3d2270 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -902,8 +902,14 @@ void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct fol } #define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ - bitmap_cut(dst, subpage->bitmaps, 0, \ - subpage_info->name##_offset, subpage_info->bitmap_nr_bits) +{ \ + const int bitmap_nr_bits = subpage_info->bitmap_nr_bits; \ + \ + ASSERT(bitmap_nr_bits < BITS_PER_LONG); \ + *dst = bitmap_read(subpage->bitmaps, \ + subpage_info->name##_offset, \ + bitmap_nr_bits); \ +} void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) From 8189197425e79f65281938ef29015ebfcf5deaa3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 7 Aug 2024 14:31:54 +0930 Subject: [PATCH 064/110] btrfs: refactor __extent_writepage_io() to do sector-by-sector submission Unlike the bitmap usage inside raid56, for __extent_writepage_io() we handle the subpage submission not sector-by-sector, but for each dirty range we found. This is not a big deal normally, as the complex subpage code is already mostly optimized out by the compiler for x86_64. However for the sake of consistency and for the future of subpage sector-perfect compression support, this patch does: - Extract the sector submission code into submit_one_sector() - Add the needed code to extract the dirty bitmap for the subpage case There is a small pitfall for the non-subpage case, as we cleared page dirty before starting writeback, so we have to manually set the default dirty_bitmap to 1 for that case. - Use bitmap_and() to calculate the target sectors we need to submit This is done for both subpage and non-subpage cases, and will later be expanded to skip inline/compression ranges. For x86_64, the dirty bitmap will be fixed to 1, with the length of 1, so we're still doing the same workload per sector. For larger page sizes, the overhead will be a little larger, as previously we only needed to do one extent_map lookup per dirty range, but now it will be one extent_map lookup per sector. But that is the same frequency as x86_64, so we're just aligning the behavior to x86_64. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 202 +++++++++++++++++++------------------------ fs/btrfs/subpage.c | 17 ++++ fs/btrfs/subpage.h | 3 + 3 files changed, 109 insertions(+), 113 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e56b62746a15..822e2bf8bc99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1333,56 +1333,68 @@ out: } /* - * Find the first byte we need to write. + * Return 0 if we have submitted or queued the sector for submission. + * Return <0 for critical errors. * - * For subpage, one page can contain several sectors, and - * __extent_writepage_io() will just grab all extent maps in the page - * range and try to submit all non-inline/non-compressed extents. - * - * This is a big problem for subpage, we shouldn't re-submit already written - * data at all. - * This function will lookup subpage dirty bit to find which range we really - * need to submit.
- * - * Return the next dirty range in [@start, @end). - * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ -static void find_next_dirty_byte(const struct btrfs_fs_info *fs_info, - struct folio *folio, u64 *start, u64 *end) +static int submit_one_sector(struct btrfs_inode *inode, + struct folio *folio, + u64 filepos, struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { - struct btrfs_subpage *subpage = folio_get_private(folio); - struct btrfs_subpage_info *spi = fs_info->subpage_info; - u64 orig_start = *start; - /* Declare as unsigned long so we can use bitmap ops */ - unsigned long flags; - int range_start_bit; - int range_end_bit; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map *em; + u64 block_start; + u64 disk_bytenr; + u64 extent_offset; + u64 em_end; + const u32 sectorsize = fs_info->sectorsize; + + ASSERT(IS_ALIGNED(filepos, sectorsize)); + + /* @filepos >= i_size case should be handled by the caller. */ + ASSERT(filepos < i_size); + + em = btrfs_get_extent(inode, NULL, filepos, sectorsize); + if (IS_ERR(em)) + return PTR_ERR_OR_ZERO(em); + + extent_offset = filepos - em->start; + em_end = extent_map_end(em); + ASSERT(filepos <= em_end); + ASSERT(IS_ALIGNED(em->start, sectorsize)); + ASSERT(IS_ALIGNED(em->len, sectorsize)); + + block_start = extent_map_block_start(em); + disk_bytenr = extent_map_block_start(em) + extent_offset; + + ASSERT(!extent_map_is_compressed(em)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); + + free_extent_map(em); + em = NULL; + + btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); + /* + * Above call should set the whole folio with writeback flag, even + * just for a single subpage sector. + * As long as the folio is properly locked and the range is correct, + * we should always get the folio with writeback flag. + */ + ASSERT(folio_test_writeback(folio)); /* - * For regular sector size == page size case, since one page only - * contains one sector, we return the page offset directly. + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * folio for range already written to disk. 
*/ - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - *start = folio_pos(folio); - *end = folio_pos(folio) + folio_size(folio); - return; - } - - range_start_bit = spi->dirty_offset + - (offset_in_folio(folio, orig_start) >> - fs_info->sectorsize_bits); - - /* We should have the page locked, but just in case */ - spin_lock_irqsave(&subpage->lock, flags); - bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, - spi->dirty_offset + spi->bitmap_nr_bits); - spin_unlock_irqrestore(&subpage->lock, flags); - - range_start_bit -= spi->dirty_offset; - range_end_bit -= spi->dirty_offset; - - *start = folio_pos(folio) + range_start_bit * fs_info->sectorsize; - *end = folio_pos(folio) + range_end_bit * fs_info->sectorsize; + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + submit_extent_folio(bio_ctrl, disk_bytenr, folio, + sectorsize, filepos - folio_pos(folio)); + return 0; } /* @@ -1400,16 +1412,24 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, loff_t i_size, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 cur = start; - u64 end = start + len - 1; - u64 extent_offset; - u64 block_start; - struct extent_map *em; + unsigned long range_bitmap = 0; + /* + * This is the default value for sectorsize == PAGE_SIZE case. + * We know we need to write the dirty sector (aka the page), + * even if the page is not dirty (we cleared it before entering). + * + * For subpage cases we will get the correct bitmap later. + */ + unsigned long dirty_bitmap = 1; + unsigned int bitmap_size = 1; + const u64 folio_start = folio_pos(folio); + u64 cur; + int bit; int ret = 0; int nr = 0; - ASSERT(start >= folio_pos(folio) && - start + len <= folio_pos(folio) + folio_size(folio)); + ASSERT(start >= folio_start && + start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); if (ret) { @@ -1419,18 +1439,23 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, return 1; } + if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { + ASSERT(fs_info->subpage_info); + btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap); + bitmap_size = fs_info->subpage_info->bitmap_nr_bits; + } + for (cur = start; cur < start + len; cur += fs_info->sectorsize) + set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); + bitmap_and(&dirty_bitmap, &dirty_bitmap, &range_bitmap, bitmap_size); + bio_ctrl->end_io_func = end_bbio_data_write; - while (cur <= end) { - u32 len = end - cur + 1; - u64 disk_bytenr; - u64 em_end; - u64 dirty_range_start = cur; - u64 dirty_range_end; - u32 iosize; + + for_each_set_bit(bit, &dirty_bitmap, bitmap_size) { + cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { - btrfs_mark_ordered_io_finished(inode, folio, cur, len, - true); + btrfs_mark_ordered_io_finished(inode, folio, cur, + start + len - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. * * But we still need to clear the dirty subpage bit, or * the next time the folio gets dirtied, we will try to * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent.
*/ - btrfs_folio_clear_dirty(fs_info, folio, cur, len); + btrfs_folio_clear_dirty(fs_info, folio, cur, + start + len - cur); break; } - - find_next_dirty_byte(fs_info, folio, &dirty_range_start, - &dirty_range_end); - if (cur < dirty_range_start) { - cur = dirty_range_start; - continue; - } - - em = btrfs_get_extent(inode, NULL, cur, len); - if (IS_ERR(em)) { - ret = PTR_ERR_OR_ZERO(em); + ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); + if (ret < 0) goto out_error; - } - - extent_offset = cur - em->start; - em_end = extent_map_end(em); - ASSERT(cur <= em_end); - ASSERT(cur < end); - ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); - ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); - - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; - - ASSERT(!extent_map_is_compressed(em)); - ASSERT(block_start != EXTENT_MAP_HOLE); - ASSERT(block_start != EXTENT_MAP_INLINE); - - /* - * Note that em_end from extent_map_end() and dirty_range_end from - * find_next_dirty_byte() are all exclusive - */ - iosize = min(min(em_end, end + 1), dirty_range_end) - cur; - free_extent_map(em); - em = NULL; - - /* - * Although the PageDirty bit is cleared before entering this - * function, subpage dirty bit is not cleared. - * So clear subpage dirty bit here so next time we won't submit - * folio for range already written to disk. - */ - btrfs_folio_clear_dirty(fs_info, folio, cur, iosize); - btrfs_set_range_writeback(inode, cur, cur + iosize - 1); - if (!folio_test_writeback(folio)) { - btrfs_err(inode->root->fs_info, - "folio %lu not writeback, cur %llu end %llu", - folio->index, cur, end); - } - - submit_extent_folio(bio_ctrl, disk_bytenr, folio, - iosize, cur - folio_pos(folio)); - cur += iosize; nr++; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index f8795c3d2270..84a9953e32f3 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -946,3 +946,20 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, subpage_info->bitmap_nr_bits, &ordered_bitmap, subpage_info->bitmap_nr_bits, &checked_bitmap); } + +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap) +{ + struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; + struct btrfs_subpage *subpage; + unsigned long flags; + + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + ASSERT(subpage_info); + subpage = folio_get_private(folio); + + spin_lock_irqsave(&subpage->lock, flags); + GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, ret_bitmap); + spin_unlock_irqrestore(&subpage->lock, flags); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 5532cc4fac50..eee55e5a3952 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -175,6 +175,9 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, + struct folio *folio, + unsigned long *ret_bitmap); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); From 14ed830d10322007565af3a0da39948f229a72d6 Mon Sep 17 00:00:00 2001 From: Junchao Sun Date: Fri, 7 Jun 2024 22:30:20 +0800 Subject: [PATCH 065/110] btrfs: qgroup: use goto style to handle errors in add_delayed_ref() Clean up resources using goto to get rid of repeated code. 
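The pattern in a minimal, self-contained form (hypothetical allocations, not the btrfs code): each failure path jumps to a label that frees exactly what has been allocated so far, so the unwinding code exists only once instead of being repeated in every error branch.

#include <stdlib.h>

struct node { int data; };
struct head { struct node *node; };

static int add_ref(void)
{
	struct node *node;
	struct head *head;

	node = malloc(sizeof(*node));
	if (!node)
		return -1;

	head = malloc(sizeof(*head));
	if (!head)
		goto free_node;	/* unwind only what already exists */

	head->node = node;
	/* ... the actual work would happen here ... */

	free(head);
	free(node);
	return 0;

free_node:
	free(node);
	return -1;
}

int main(void)
{
	return add_ref() ? 1 : 0;
}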
Reviewed-by: Qu Wenruo Signed-off-by: Junchao Sun Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 06a9e0542d70..0bfa014b796d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -1005,18 +1005,13 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - return -ENOMEM; - } + if (!head_ref) + goto free_node; if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); - if (!record) { - kmem_cache_free(btrfs_delayed_ref_node_cachep, node); - kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); - return -ENOMEM; - } + if (!record) + goto free_head_ref; } init_delayed_ref_common(fs_info, node, generic_ref); @@ -1052,6 +1047,12 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record); return 0; + +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_node: + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); + return -ENOMEM; } /* From 3cce39a8ca4e7565b6ac1fbcf858171bf07c3757 Mon Sep 17 00:00:00 2001 From: Junchao Sun Date: Fri, 7 Jun 2024 22:30:21 +0800 Subject: [PATCH 066/110] btrfs: qgroup: use xarray to track dirty extents in transaction Use xarray to track dirty extents to reduce the size of the struct btrfs_qgroup_extent_record from 64 bytes to 40 bytes. The xarray is more cache-line friendly, and it also reduces the complexity of the insertion and search code compared to the rb tree. Another change introduced is about error handling. Before this patch, btrfs_qgroup_trace_extent_nolock() always succeeded. In this patch, because this function calls xa_store(), which can fail, we mark the qgroup as inconsistent if an error happens and then free the preallocated memory. Also we preallocate the memory before spin_lock(); if the preallocation fails, the error handling is the same as in the existing code. Suggested-by: Qu Wenruo Signed-off-by: Junchao Sun Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 17 +++++++++-- fs/btrfs/delayed-ref.h | 4 +-- fs/btrfs/qgroup.c | 66 ++++++++++++++++++++---------------------- fs/btrfs/qgroup.h | 1 - fs/btrfs/transaction.c | 5 ++-- 5 files changed, 50 insertions(+), 43 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0bfa014b796d..ad9ef8312e41 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -855,11 +855,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, /* Record qgroup extent info if provided */ if (qrecord) { - if (btrfs_qgroup_trace_extent_nolock(trans->fs_info, - delayed_refs, qrecord)) + int ret; + + ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, + delayed_refs, qrecord); + if (ret) { + /* Clean up if insertion fails or item exists.
*/ + xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); kfree(qrecord); - else + } else { qrecord_inserted = true; + } } trace_add_delayed_ref_head(trans->fs_info, head_ref, action); @@ -1012,6 +1018,9 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) goto free_head_ref; + if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, + generic_ref->bytenr, GFP_NOFS)) + goto free_record; } init_delayed_ref_common(fs_info, node, generic_ref); @@ -1048,6 +1057,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, return btrfs_qgroup_trace_extent_post(trans, record); return 0; +free_record: + kfree(record); free_head_ref: kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); free_node: diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 05f634eb472d..085f30968aba 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -202,8 +202,8 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; - /* dirty extent records */ - struct rb_root dirty_extent_root; + /* Track dirty extent records. */ + struct xarray dirty_extents; /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index feb8f9f2f358..c297909f1506 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1998,16 +1998,14 @@ out: * * Return 0 for success insert * Return >0 for existing record, caller can free @record safely. - * Error is not possible + * Return <0 for insertion failure, caller can free @record safely. */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) { - struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_qgroup_extent_record *entry; - u64 bytenr = record->bytenr; + struct btrfs_qgroup_extent_record *existing, *ret; + unsigned long bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) return 1; @@ -2015,26 +2013,24 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, - node); - if (bytenr < entry->bytenr) { - p = &(*p)->rb_left; - } else if (bytenr > entry->bytenr) { - p = &(*p)->rb_right; - } else { - if (record->data_rsv && !entry->data_rsv) { - entry->data_rsv = record->data_rsv; - entry->data_rsv_refroot = - record->data_rsv_refroot; - } - return 1; + xa_lock(&delayed_refs->dirty_extents); + existing = xa_load(&delayed_refs->dirty_extents, bytenr); + if (existing) { + if (record->data_rsv && !existing->data_rsv) { + existing->data_rsv = record->data_rsv; + existing->data_rsv_refroot = record->data_rsv_refroot; } + xa_unlock(&delayed_refs->dirty_extents); + return 1; + } + + ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC); + xa_unlock(&delayed_refs->dirty_extents); + if (xa_is_err(ret)) { + qgroup_mark_inconsistent(fs_info); + return xa_err(ret); } - rb_link_node(&record->node, parent_node, p); - rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); return 0; } @@ -2141,6 +2137,11 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!record) return -ENOMEM; + if 
(xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) { + kfree(record); + return -ENOMEM; + } + delayed_refs = &trans->transaction->delayed_refs; record->bytenr = bytenr; record->num_bytes = num_bytes; @@ -2149,7 +2150,9 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); spin_unlock(&delayed_refs->lock); - if (ret > 0) { + if (ret) { + /* Clean up if insertion fails or item exists. */ + xa_release(&delayed_refs->dirty_extents, record->bytenr); kfree(record); return 0; } @@ -3018,7 +3021,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) struct btrfs_qgroup_extent_record *record; struct btrfs_delayed_ref_root *delayed_refs; struct ulist *new_roots = NULL; - struct rb_node *node; + unsigned long index; u64 num_dirty_extents = 0; u64 qgroup_to_skip; int ret = 0; @@ -3028,10 +3031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; - while ((node = rb_first(&delayed_refs->dirty_extent_root))) { - record = rb_entry(node, struct btrfs_qgroup_extent_record, - node); - + xa_for_each(&delayed_refs->dirty_extents, index, record) { num_dirty_extents++; trace_btrfs_qgroup_account_extents(fs_info, record); @@ -3097,7 +3097,7 @@ cleanup: ulist_free(record->old_roots); ulist_free(new_roots); new_roots = NULL; - rb_erase(node, &delayed_refs->dirty_extent_root); + xa_erase(&delayed_refs->dirty_extents, index); kfree(record); } @@ -4874,15 +4874,13 @@ out: void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) { struct btrfs_qgroup_extent_record *entry; - struct btrfs_qgroup_extent_record *next; - struct rb_root *root; + unsigned long index; - root = &trans->delayed_refs.dirty_extent_root; - rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + xa_for_each(&trans->delayed_refs.dirty_extents, index, entry) { ulist_free(entry->old_roots); kfree(entry); } - *root = RB_ROOT; + xa_destroy(&trans->delayed_refs.dirty_extents); } void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index deb479d176a9..98adf4ec7b01 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -125,7 +125,6 @@ struct btrfs_inode; * Record a dirty extent, and info qgroup to update quota on it */ struct btrfs_qgroup_extent_record { - struct rb_node node; u64 bytenr; u64 num_bytes; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5e6fff8e1003..0fc873af891f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -143,8 +143,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); - WARN_ON(!RB_EMPTY_ROOT( - &transaction->delayed_refs.dirty_extent_root)); + WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", @@ -351,7 +350,7 @@ loop: memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; - cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; + xa_init(&cur_trans->delayed_refs.dirty_extents); atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* From e39ba5dfd0b57905fe002da9f80649b6ab388134 Mon Sep 17 00:00:00 
2001 From: Thorsten Blum Date: Wed, 14 Aug 2024 10:13:29 +0200 Subject: [PATCH 067/110] btrfs: send: fix grammar in comments Fix a few obvious grammar mistakes: a -> an, then -> than. Signed-off-by: Thorsten Blum Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 619fa0b8b3f6..7f48ba6c1c77 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -62,7 +62,7 @@ struct fs_path { /* * Average path length does not exceed 200 bytes, we'll have * better packing in the slab and higher chance to satisfy - * a allocation later during send. + * an allocation later during send. */ char pad[256]; }; @@ -1136,7 +1136,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Start with a small buffer (1 page). If later we end up needing more * space, which can happen for xattrs on a fs with a leaf size greater - * then the page size, attempt to increase the buffer. Typically xattr + * than the page size, attempt to increase the buffer. Typically xattr * values are small. */ buf_len = PATH_MAX; From 2c70fe16ea0c0d3f08659fc9d75b4840711fee05 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 14 Aug 2024 10:50:21 +0930 Subject: [PATCH 068/110] btrfs: remove the nr_ret parameter from __extent_writepage_io() The parameter @nr_ret is used to tell the caller how many sectors have been submitted for IO. Then callers check the @nr_ret value to determine if we need to manually clear PAGECACHE_TAG_DIRTY, because if we submitted no sector (e.g. all sectors are beyond i_size), folio_start_writeback() is never called and thus the PAGECACHE_TAG_DIRTY tag will not be cleared. Remove this parameter by: - Moving the btrfs_folio_clear_writeback() call into __extent_writepage_io() So that if we didn't submit any IO, we manually call btrfs_folio_set_writeback() to clear PAGECACHE_TAG_DIRTY when the page is no longer dirty. - Using a bool instead of an int to record if we have submitted any sector. - Using subpage compatible helpers to end folio writeback. This brings no change to the behavior, it's just for the sake of consistency. As for the call site inside __extent_writepage(), we're always called for the whole page, so the existing full page helper folio_(start|end)_writeback() is totally fine. For the call site inside extent_write_locked_range(), although we can have a subpage range, folio_start_writeback() will only clear PAGECACHE_TAG_DIRTY if the page is no longer dirty, and the full folio will still be dirty if there is any subpage dirty range. Only when the last dirty subpage sector is cleared will folio_start_writeback() clear PAGECACHE_TAG_DIRTY. So no matter if we call the full page or subpage helper, the result is still the same, so just use the subpage helpers for consistency.
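The tag behavior this relies on can be modeled with a toy userspace structure (a deliberately simplified, assumption-laden model, not the real page cache): the dirty tag is only dropped when writeback starts on a folio that is itself already clean, which is why the no-IO path has to cycle writeback once.

#include <stdbool.h>
#include <stdio.h>

/* Toy model: one folio plus the mapping-level PAGECACHE_TAG_DIRTY tag. */
struct folio_model {
	bool dirty;	/* folio dirty flag */
	bool writeback;	/* folio writeback flag */
	bool tag_dirty;	/* PAGECACHE_TAG_DIRTY in the mapping */
};

/* Models folio_start_writeback(): only drops the dirty tag when the
 * folio itself is no longer dirty. */
static void start_writeback(struct folio_model *f)
{
	f->writeback = true;
	if (!f->dirty)
		f->tag_dirty = false;
}

static void end_writeback(struct folio_model *f)
{
	f->writeback = false;
}

int main(void)
{
	/* All sectors beyond i_size: dirty flag cleared, but tag still set. */
	struct folio_model f = { .dirty = false, .writeback = false, .tag_dirty = true };

	/* The no-IO path: cycle writeback once to update the tag. */
	start_writeback(&f);
	end_writeback(&f);

	printf("tag_dirty=%d\n", f.tag_dirty);	/* prints 0 */
	return 0;
}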
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 822e2bf8bc99..6083bed89df2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1409,7 +1409,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct folio *folio, u64 start, u32 len, struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size, int *nr_ret) + loff_t i_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; @@ -1422,11 +1422,11 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ unsigned long dirty_bitmap = 1; unsigned int bitmap_size = 1; + bool submitted_io = false; const u64 folio_start = folio_pos(folio); u64 cur; int bit; int ret = 0; - int nr = 0; ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); @@ -1470,20 +1470,24 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); if (ret < 0) - goto out_error; - nr++; + goto out; + submitted_io = true; } btrfs_folio_assert_not_dirty(fs_info, folio, start, len); -out: /* - * If we finish without problem, we should not only clear folio dirty, - * but also empty subpage dirty bits + * If we didn't submit any sector (>= i_size), folio dirty gets + * cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared + * by folio_start_writeback() if the folio is not dirty). + * + * Here we set writeback and clear for the range. If the full folio + * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. */ - *nr_ret = nr; + if (!submitted_io) { + btrfs_folio_set_writeback(fs_info, folio, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); + } return ret; } @@ -1501,7 +1505,6 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct struct inode *inode = folio->mapping->host; const u64 page_start = folio_pos(folio); int ret; - int nr = 0; size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; @@ -1532,18 +1535,13 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct goto done; ret = __extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size, &nr); + PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; bio_ctrl->wbc->nr_to_write--; done: - if (nr == 0) { - /* make sure the mapping tag for page dirty gets cleared */ - folio_start_writeback(folio); - folio_end_writeback(folio); - } if (ret) { btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, page_start, PAGE_SIZE, !ret); @@ -2276,7 +2274,6 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); u32 cur_len = cur_end + 1 - cur; struct folio *folio; - int nr = 0; folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); @@ -2297,15 +2294,10 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(folio_test_dirty(folio)); ret = __extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, - &bio_ctrl, i_size, &nr); + &bio_ctrl, i_size); if (ret == 1) goto next_page; - /* Make sure the mapping tag for page dirty gets cleared.
*/ - if (nr == 0) { - btrfs_folio_set_writeback(fs_info, folio, cur, cur_len); - btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len); - } if (ret) { btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio, cur, cur_len, !ret); From ce4a71ee157e810ec28ad44e8148dfc55e77d4a0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 26 Aug 2024 15:44:50 +0930 Subject: [PATCH 069/110] btrfs: subpage: remove btrfs_fs_info::subpage_info member The member btrfs_fs_info::subpage_info stores the cached bitmap start position inside the merged bitmap. However in reality there is only one thing depending on the sectorsize, bitmap_nr_bits, which records the number of sectors that fit inside a page. The sequence of sub-bitmaps has a fixed order, thus it's just a quick multiplication to calculate the start position of each sub-bitmap. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 +------ fs/btrfs/extent_io.c | 8 ++-- fs/btrfs/fs.h | 2 +- fs/btrfs/subpage.c | 93 ++++++++++++++------------------------------ fs/btrfs/subpage.h | 43 +++++++------------- 5 files changed, 50 insertions(+), 110 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a6f5441e62d1..612460e07b2e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1285,7 +1285,6 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); - kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -3322,6 +3321,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; @@ -3346,20 +3346,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) { - struct btrfs_subpage_info *subpage_info; - + if (sectorsize < PAGE_SIZE) btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); - if (!subpage_info) { - ret = -ENOMEM; - goto fail_alloc; - } - btrfs_init_subpage_info(subpage_info, sectorsize); - fs_info->subpage_info = subpage_info; - } ret = btrfs_init_workqueues(fs_info); if (ret) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6083bed89df2..643dd948054f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1440,9 +1440,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->subpage_info); + ASSERT(fs_info->sectors_per_page > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap); - bitmap_size = fs_info->subpage_info->bitmap_nr_bits; + bitmap_size = fs_info->sectors_per_page; } for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); @@ -1827,7 +1827,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; /* Lock and write each dirty extent buffers in the range */ - while (bit_start <
fs_info->subpage_info->bitmap_nr_bits) { + while (bit_start < fs_info->sectors_per_page) { struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; @@ -1843,7 +1843,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) break; } spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&page->mapping->i_private_lock); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 3d6d4b503220..79f64e383edd 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -703,8 +703,8 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; + u32 sectors_per_page; struct workqueue_struct *scrub_workers; - struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 84a9953e32f3..1dda17b5ab12 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -88,37 +88,6 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space } #endif -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) -{ - unsigned int cur = 0; - unsigned int nr_bits; - - ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); - - nr_bits = PAGE_SIZE / sectorsize; - subpage_info->bitmap_nr_bits = nr_bits; - - subpage_info->uptodate_offset = cur; - cur += nr_bits; - - subpage_info->dirty_offset = cur; - cur += nr_bits; - - subpage_info->writeback_offset = cur; - cur += nr_bits; - - subpage_info->ordered_offset = cur; - cur += nr_bits; - - subpage_info->checked_offset = cur; - cur += nr_bits; - - subpage_info->locked_offset = cur; - cur += nr_bits; - - subpage_info->total_nr_bits = cur; -} - int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) { @@ -165,7 +134,7 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, ASSERT(fs_info->sectorsize < PAGE_SIZE); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -248,7 +217,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->subpage_info->name##_offset; \ + __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -420,13 +389,13 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) #define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->subpage_info->name##_offset, \ - fs_info->subpage_info->bitmap_nr_bits) + fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ + fs_info->sectors_per_page) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -805,7 +774,7 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info 
*fs_info, ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->writers); - ASSERT(ret <= fs_info->subpage_info->bitmap_nr_bits); + ASSERT(ret <= fs_info->sectors_per_page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -821,14 +790,13 @@ bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage = folio_get_private(folio); + const u32 sectors_per_page = fs_info->sectors_per_page; const unsigned int len = PAGE_SIZE - offset_in_page(search_start); const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, locked, search_start, len); - const unsigned int locked_bitmap_start = subpage_info->locked_offset; - const unsigned int locked_bitmap_end = locked_bitmap_start + - subpage_info->bitmap_nr_bits; + const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked; + const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page; unsigned long flags; int first_zero; int first_set; @@ -901,21 +869,21 @@ void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct fol } } -#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ +#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ - const int bitmap_nr_bits = subpage_info->bitmap_nr_bits; \ + const int sectors_per_page = fs_info->sectors_per_page; \ \ - ASSERT(bitmap_nr_bits < BITS_PER_LONG); \ + ASSERT(sectors_per_page < BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ - subpage_info->name##_offset, \ - bitmap_nr_bits); \ + sectors_per_page * btrfs_bitmap_nr_##name, \ + sectors_per_page); \ } void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; + const u32 sectors_per_page = fs_info->sectors_per_page; unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -924,42 +892,41 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(subpage_info); + ASSERT(sectors_per_page > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, locked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl 
writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - subpage_info->bitmap_nr_bits, &uptodate_bitmap, - subpage_info->bitmap_nr_bits, &dirty_bitmap, - subpage_info->bitmap_nr_bits, &writeback_bitmap, - subpage_info->bitmap_nr_bits, &ordered_bitmap, - subpage_info->bitmap_nr_bits, &checked_bitmap); + sectors_per_page, &uptodate_bitmap, + sectors_per_page, &dirty_bitmap, + sectors_per_page, &writeback_bitmap, + sectors_per_page, &ordered_bitmap, + sectors_per_page, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap) { - struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(subpage_info); + ASSERT(fs_info->sectors_per_page > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, ret_bitmap); + GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 5532cc4fac50..b67cd5f6539d 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -19,39 +19,23 @@ struct btrfs_fs_info; * * This structure records how they are organized in the bitmap: * - * /- uptodate_offset /- dirty_offset /- ordered_offset + * /- uptodate /- dirty /- ordered * | | | * v v v * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o| - * |<- bitmap_nr_bits ->| - * |<----------------- total_nr_bits ------------------>| + * |< sectors_per_page >| + * + * Unlike regular macro-like enums, here we do not use upper-case names, as + * these names will be utilized in various macros to define function names. */ -struct btrfs_subpage_info { - /* Number of bits for each bitmap */ - unsigned int bitmap_nr_bits; - - /* Total number of bits for the whole bitmap */ - unsigned int total_nr_bits; - - /* - * *_offset indicates where the bitmap starts, the length is always - * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. - */ - unsigned int uptodate_offset; - unsigned int dirty_offset; - unsigned int writeback_offset; - unsigned int ordered_offset; - unsigned int checked_offset; - - /* - * For locked bitmaps, normally it's subpage representation for folio - * Locked flag, but metadata is different: - * - * - Metadata doesn't really lock the folio - * It's just to prevent page::private get cleared before the last - * end_page_read(). - */ - unsigned int locked_offset; +enum { + btrfs_bitmap_nr_uptodate = 0, + btrfs_bitmap_nr_dirty, + btrfs_bitmap_nr_writeback, + btrfs_bitmap_nr_ordered, + btrfs_bitmap_nr_checked, + btrfs_bitmap_nr_locked, + btrfs_bitmap_nr_max }; /* @@ -99,7 +83,6 @@ static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, } #endif -void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); From 792e86ef31b91c98c529f8c4fb6aa14886584193 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:40:11 +0200 Subject: [PATCH 070/110] btrfs: rename btrfs_submit_bio() to btrfs_submit_bbio() The function name is a bit misleading as it submits the btrfs_bio (bbio); rename it so we can use btrfs_submit_bio() when an actual bio is submitted.
Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 +++++----- fs/btrfs/bio.h | 6 +++--- fs/btrfs/compression.c | 4 ++-- fs/btrfs/direct-io.c | 2 +- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/inode.c | 4 ++-- fs/btrfs/scrub.c | 10 +++++----- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f6cb58d7f16a..4f3e265880bf 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -53,7 +53,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, /* * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * btrfs, and is used for all I/O submitted through btrfs_submit_bbio(). * * Just like the underlying bio_alloc_bioset it will not fail as it is backed by * a mempool. @@ -211,7 +211,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, goto done; } - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return; } @@ -280,7 +280,7 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); - btrfs_submit_bio(repair_bbio, mirror); + btrfs_submit_bbio(repair_bbio, mirror); return fbio; } @@ -777,7 +777,7 @@ fail: return true; } -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) { /* If bbio->inode is not populated, its file_offset must be 0. */ ASSERT(bbio->inode || bbio->file_offset == 0); @@ -789,7 +789,7 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) /* * Submit a repair write. * - * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a * RAID setup. Here we only want to write the one bad copy, so we do the * mapping ourselves and submit the bio directly. * diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index d9dd5276093d..e48612340745 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -29,7 +29,7 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* * Highlevel btrfs I/O structure. It is allocated by btrfs_bio_alloc and - * passed to btrfs_submit_bio for mapping to the physical devices. + * passed to btrfs_submit_bbio() for mapping to the physical devices. */ struct btrfs_bio { /* @@ -42,7 +42,7 @@ struct btrfs_bio { union { /* * For data reads: checksumming and original I/O information. - * (for internal use in the btrfs_submit_bio machinery only) + * (for internal use in the btrfs_submit_bbio() machinery only) */ struct { u8 *csum; @@ -104,7 +104,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* Submit using blkcg_punt_bio_submit. 
*/ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE -void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct folio *folio, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 832ab8984c41..39cd2ed1974b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -395,7 +395,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb->bbio.ordered = ordered; btrfs_add_compressed_bio_folios(cb); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); } /* @@ -630,7 +630,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, 0); + btrfs_submit_bbio(&cb->bbio, 0); return; out_free_compressed_pages: diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 364bce34f034..ea7f918b1c45 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -726,7 +726,7 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 643dd948054f..8de6d226475d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -117,7 +117,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); else - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -1800,7 +1800,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_unlock(folio); } } - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); } /* @@ -3572,7 +3572,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, ASSERT(ret); } } - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bbio(bbio, mirror_num); done: if (wait == WAIT_COMPLETE) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d25aeb844a64..27aa67f135ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9152,7 +9152,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, &priv); @@ -9167,7 +9167,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } while (disk_io_size); atomic_inc(&priv.pending); - btrfs_submit_bio(bbio, 0); + btrfs_submit_bbio(bbio, 0); if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b3afa6365823..3a3427428074 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -838,7 +838,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); bbio = NULL; @@ -857,7 +857,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe 
*stripe, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); if (wait) wait_scrub_stripe_io(stripe); } @@ -1683,7 +1683,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); bbio = NULL; } @@ -1720,7 +1720,7 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1776,7 +1776,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, mirror = calc_next_mirror(mirror, num_copies); } - btrfs_submit_bio(bbio, mirror); + btrfs_submit_bbio(bbio, mirror); } static bool stripe_has_metadata_error(struct scrub_stripe *stripe) From 22b4ef50dc1d11376f09dd8e9e7cf18ef5ead48f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:41:45 +0200 Subject: [PATCH 071/110] btrfs: rename __btrfs_submit_bio() and drop double underscores Previous patch freed the function name btrfs_submit_bio() so we can use it for a helper that submits struct bio. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 4f3e265880bf..d5dcc356df33 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -502,8 +502,8 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } -static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, - struct btrfs_io_stripe *smap, int mirror_num) +static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) { if (!bioc) { /* Single mirror read/write fast path. */ @@ -603,7 +603,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) * context. This changes nothing when cgroups aren't in use. */ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; - __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); + btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } static bool should_async_write(struct btrfs_bio *bbio) @@ -752,7 +752,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) } } - __btrfs_submit_bio(bio, bioc, &smap, mirror_num); + btrfs_submit_bio(bio, bioc, &smap, mirror_num); done: return map_length == length; @@ -878,7 +878,7 @@ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_ ASSERT(smap.dev == fs_info->dev_replace.srcdev); smap.dev = fs_info->dev_replace.tgtdev; } - __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); + btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); return; fail: From 06de42c5a98a28060b314589241cabcacc3c4ff8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:30:16 +0200 Subject: [PATCH 072/110] btrfs: rename __extent_writepage() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. 
Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 28 ++++++++++++++-------------- fs/btrfs/inode.c | 2 +- fs/btrfs/subpage.c | 4 ++-- include/trace/events/btrfs.h | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8de6d226475d..f7a388529c17 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1177,7 +1177,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) } /* - * helper for __extent_writepage, doing all of the delayed allocation setup. + * helper for extent_writepage(), doing all of the delayed allocation setup. * * This returns 1 if btrfs_run_delalloc_range function did all the work required * to write the page (copy into inline extent). In this case the IO has @@ -1398,18 +1398,18 @@ static int submit_one_sector(struct btrfs_inode *inode, } /* - * helper for __extent_writepage. This calls the writepage start hooks, + * Helper for extent_writepage(). This calls the writepage start hooks, * and does the loop to map the page into extents and bios. * * We return 1 if the IO is started and the page is unlocked, * 0 if all went well (page still locked) * < 0 if there were errors (page still locked) */ -static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, - struct folio *folio, - u64 start, u32 len, - struct btrfs_bio_ctrl *bio_ctrl, - loff_t i_size) +static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, + struct folio *folio, + u64 start, u32 len, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; @@ -1500,7 +1500,7 @@ out: * Return 0 if everything goes well. * Return <0 for error. */ -static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) +static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; const u64 page_start = folio_pos(folio); @@ -1509,7 +1509,7 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - trace___extent_writepage(folio, inode, bio_ctrl->wbc); + trace_extent_writepage(folio, inode, bio_ctrl->wbc); WARN_ON(!folio_test_locked(folio)); @@ -1534,8 +1534,8 @@ static int __extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ct if (ret) goto done; - ret = __extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + ret = extent_writepage_io(BTRFS_I(inode), folio, folio_pos(folio), + PAGE_SIZE, bio_ctrl, i_size); if (ret == 1) return 0; @@ -2202,7 +2202,7 @@ retry: continue; } - ret = __extent_writepage(folio, bio_ctrl); + ret = extent_writepage(folio, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2293,8 +2293,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); - ret = __extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, - &bio_ctrl, i_size); + ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, + &bio_ctrl, i_size); if (ret == 1) goto next_page; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 27aa67f135ff..efe75b03d5f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -747,7 +747,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, /* * In the successful case (ret == 0 here), cow_file_range will return 1. 
* - * Quite a bit further up the callstack in __extent_writepage, ret == 1 + * Quite a bit further up the callstack in extent_writepage(), ret == 1 * is treated as a short circuited success and does not unlock the folio, * so we must do it here. * diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 1dda17b5ab12..ca7d2aedfa8d 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -705,7 +705,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, * - Page locked by plain lock_page() * It should not have any subpage::writers count. * Can be unlocked by unlock_page(). - * This is the most common locked page for __extent_writepage() called + * This is the most common locked page for extent_writepage() called * inside extent_write_cache_pages(). * Rarer cases include the @locked_page from extent_write_locked_range(). * @@ -829,7 +829,7 @@ out: * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range, * this ends all writer locked ranges of a page. * - * This is for the locked page of __extent_writepage(), as the locked page + * This is for the locked page of extent_writepage(), as the locked page * can contain several locked subpage ranges. */ void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0eddbb8b6728..e4add61e00f1 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -721,7 +721,7 @@ DECLARE_EVENT_CLASS(btrfs__writepage, __entry->writeback_index) ); -DEFINE_EVENT(btrfs__writepage, __extent_writepage, +DEFINE_EVENT(btrfs__writepage, extent_writepage, TP_PROTO(const struct folio *folio, const struct inode *inode, const struct writeback_control *wbc), From a92914a80b137a447688d868274871f16ec152d5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:44:15 +0200 Subject: [PATCH 073/110] btrfs: rename __compare_inode_defrag() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. 
Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index f6dbda37a361..e2949f630584 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -45,7 +45,7 @@ struct inode_defrag { u32 extent_thresh; }; -static int __compare_inode_defrag(struct inode_defrag *defrag1, +static int compare_inode_defrag(struct inode_defrag *defrag1, struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) @@ -83,7 +83,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(defrag, entry); + ret = compare_inode_defrag(defrag, entry); if (ret < 0) p = &parent->rb_left; else if (ret > 0) @@ -189,7 +189,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( parent = p; entry = rb_entry(parent, struct inode_defrag, rb_node); - ret = __compare_inode_defrag(&tmp, entry); + ret = compare_inode_defrag(&tmp, entry); if (ret < 0) p = parent->rb_left; else if (ret > 0) @@ -198,7 +198,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( goto out; } - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); if (parent) entry = rb_entry(parent, struct inode_defrag, rb_node); From b7164d9ab03137dc47faa970e25e6507f3c57590 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:44:30 +0200 Subject: [PATCH 074/110] btrfs: constify arguments of compare_inode_defrag() A comparator function does not change its parameters, make them const. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index e2949f630584..e4bb5a8651f3 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -45,8 +45,8 @@ struct inode_defrag { u32 extent_thresh; }; -static int compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) +static int compare_inode_defrag(const struct inode_defrag *defrag1, + const struct inode_defrag *defrag2) { if (defrag1->root > defrag2->root) return 1; From 6d2f07e13c01b06c6e38117c83df78fea8e9d1a9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:45:38 +0200 Subject: [PATCH 075/110] btrfs: rename __need_auto_defrag() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. 
Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index e4bb5a8651f3..b735fce21f07 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -107,7 +107,7 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return 0; @@ -130,7 +130,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, u64 transid; int ret; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) @@ -245,7 +245,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, again: if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) goto cleanup; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) goto cleanup; /* Get the inode */ @@ -306,7 +306,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; - if (!__need_auto_defrag(fs_info)) + if (!need_auto_defrag(fs_info)) break; /* find an inode to defrag */ From 42257569026182a877e6eaea70a64fd58842a79f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:50:43 +0200 Subject: [PATCH 076/110] btrfs: rename __btrfs_add_inode_defrag() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. Also update the misleading comment, the passed item is not freed, that's what the caller does. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index b735fce21f07..5258dd86dbd8 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -61,16 +61,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1, } /* - * Pop a record for an inode into the defrag tree. The lock must be held + * Insert a record for an inode into the defrag tree. The lock must be held * already. * * If you're inserting a record for an older transid than an existing record, * the transid already in the tree is lowered. - * - * If an existing record is found the defrag item you pass in is freed. */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) +static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, + struct inode_defrag *defrag) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct inode_defrag *entry; @@ -157,7 +155,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * and then re-read this inode, this new inode doesn't have * IN_DEFRAG flag. At the case, we may find the existed defrag. */ - ret = __btrfs_add_inode_defrag(inode, defrag); + ret = btrfs_insert_inode_defrag(inode, defrag); if (ret) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } else { From ffc531652d1039b2c5049a58814d74352f684837 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 03:51:38 +0200 Subject: [PATCH 077/110] btrfs: rename __btrfs_run_defrag_inode() and drop double underscores The function does not follow the pattern where the underscores would be justified, so rename it. 
Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5258dd86dbd8..41d67065d02b 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -231,8 +231,8 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) #define BTRFS_DEFRAG_BATCH 1024 -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) +static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) { struct btrfs_root *inode_root; struct inode *inode;
@@ -322,7 +322,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; root_objectid = defrag->root; - __btrfs_run_defrag_inode(fs_info, defrag); + btrfs_run_defrag_inode(fs_info, defrag); } atomic_dec(&fs_info->defrag_running);
From 276940915f232f8569124811fd8a9524f27f5748 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 27 Aug 2024 04:05:48 +0200
Subject: [PATCH 078/110] btrfs: clear defragmented inodes using postorder in btrfs_cleanup_defrag_inodes()

btrfs_cleanup_defrag_inodes() is not called frequently, only in remount or unmount, but the way it frees the inodes in fs_info->defrag_inodes is inefficient. Each time it needs to locate the first node, remove it and potentially rebalance the tree, until it's done; that approach does at least allow a conditional reschedule between iterations.

For cleanups the rbtree_postorder_for_each_entry_safe() iterator is convenient, but with it we can't reschedule and restart the iteration because some of the tree nodes would already be freed.

The cleanup operation is kmem_cache_free(), which will likely take the fast path for most objects, so rescheduling should not be necessary.

Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 41d67065d02b..89f51252d25c 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -212,20 +212,14 @@ out: void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) { - struct inode_defrag *defrag; - struct rb_node *node; + struct inode_defrag *defrag, *next; spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); + + rbtree_postorder_for_each_entry_safe(defrag, next, + &fs_info->defrag_inodes, rb_node) kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - cond_resched_lock(&fs_info->defrag_inodes_lock); - - node = rb_first(&fs_info->defrag_inodes); - } spin_unlock(&fs_info->defrag_inodes_lock); }
From 91c9f2855ead841b27eefb8968079290725d4f2e Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 27 Aug 2024 04:10:11 +0200
Subject: [PATCH 079/110] btrfs: return void from btrfs_add_inode_defrag()

The potential memory allocation failure is not a fatal error: skipping autodefrag is fine, and the caller inode_should_defrag() does not care about the error. Further writes can attempt to add the inode back to the defragmentation list again.
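In concrete terms, the function now degrades gracefully on allocation failure instead of reporting it. A condensed sketch of the resulting shape (assembled from the diff below; the tree insertion details are elided, see the hunks for the real body):

	void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
				    struct btrfs_inode *inode, u32 extent_thresh)
	{
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		struct inode_defrag *defrag;

		if (!need_auto_defrag(fs_info))
			return;
		/* Already queued for defrag, nothing to do. */
		if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
			return;

		/* Best effort: a failed allocation simply skips this round. */
		defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
		if (!defrag)
			return;

		/* ... fill in ino/transid/root/extent_thresh and insert under
		 * fs_info->defrag_inodes_lock, exactly as in the hunks below ... */
	}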
Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 14 +++++++------- fs/btrfs/defrag.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 89f51252d25c..6af593a0313d 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -117,10 +117,11 @@ static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) } /* - * Insert a defrag record for this inode if auto defrag is enabled. + * Insert a defrag record for this inode if auto defrag is enabled. No errors + * returned as they're not considered fatal. */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) +void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info;
@@ -129,10 +130,10 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, int ret; if (!need_auto_defrag(fs_info)) - return 0; + return; if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; + return; if (trans) transid = trans->transid;
@@ -141,7 +142,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) - return -ENOMEM; + return; defrag->ino = btrfs_ino(inode); defrag->transid = transid;
@@ -162,7 +163,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } spin_unlock(&fs_info->defrag_inodes_lock); - return 0; } /*
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 878528e086fb..97f36ab3f24d 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -18,8 +18,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); +void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root);
From 11e3107d47cb266a284169f36c2293af3f397fdb Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 27 Aug 2024 04:13:44 +0200
Subject: [PATCH 080/110] btrfs: drop transaction parameter from btrfs_add_inode_defrag()

There's only one caller, inode_should_defrag(), and it passes NULL to btrfs_add_inode_defrag(), so we can drop the parameter and simplify the code.

Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 11 ++--------- fs/btrfs/defrag.h | 3 +-- fs/btrfs/inode.c | 2 +- 3 files changed, 4 insertions(+), 12 deletions(-)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 6af593a0313d..5b6bf0a59b23 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -120,13 +120,11 @@ static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) } /* - * Insert a defrag record for this inode if auto defrag is enabled. No errors - * returned as they're not considered fatal.
*/ -void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct inode_defrag *defrag; - u64 transid; int ret; if (!need_auto_defrag(fs_info)) @@ -135,17 +133,12 @@ void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) return; - if (trans) - transid = trans->transid; - else - transid = btrfs_get_root_last_trans(root); - defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); if (!defrag) return; defrag->ino = btrfs_ino(inode); - defrag->transid = transid; + defrag->transid = btrfs_get_root_last_trans(root); defrag->root = btrfs_root_id(root); defrag->extent_thresh = extent_thresh; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 97f36ab3f24d..6b7596c4f0dc 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -18,8 +18,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); -void btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); +void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index efe75b03d5f1..e96b63d7e8fd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -885,7 +885,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) - btrfs_add_inode_defrag(NULL, inode, small_write); + btrfs_add_inode_defrag(inode, small_write); } static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) From df2825e98507d10cb037a308087ecd7cb3f6688d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 27 Aug 2024 04:26:51 +0200 Subject: [PATCH 081/110] btrfs: always pass readahead state to defrag Defrag ioctl passes readahead from the file, but autodefrag does not have a file so the readahead state is allocated when needed. The autodefrag loop in cleaner thread iterates over inodes so we can simply provide an on-stack readahead state and will not need to allocate it in btrfs_defrag_file(). The size is 32 bytes which is acceptable. 
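Condensed from the hunks below, the cleaner thread then supplies the state like this (a sketch of the resulting flow, not the verbatim code):

	int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
	{
		struct inode_defrag *defrag;
		/* ... */
		while (1) {
			struct file_ra_state ra = { 0 };	/* on-stack, ~32 bytes */

			/* ... pause/stop checks, then pick the next record ... */
			btrfs_run_defrag_inode(fs_info, defrag, &ra);
		}
		/* ... */
	}

with btrfs_run_defrag_inode() initializing it right before the defrag call:

	file_ra_state_init(ra, inode->i_mapping);
	ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);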
Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5b6bf0a59b23..acf1f39e45d0 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -219,7 +219,8 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) #define BTRFS_DEFRAG_BATCH 1024 static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) + struct inode_defrag *defrag, + struct file_ra_state *ra) { struct btrfs_root *inode_root; struct inode *inode; @@ -258,9 +259,10 @@ again: range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; + file_ra_state_init(ra, inode->i_mapping); sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); iput(inode); @@ -287,6 +289,8 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) atomic_inc(&fs_info->defrag_running); while (1) { + struct file_ra_state ra = { 0 }; + /* Pause the auto defragger. */ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) break; @@ -309,7 +313,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; root_objectid = defrag->root; - btrfs_run_defrag_inode(fs_info, defrag); + btrfs_run_defrag_inode(fs_info, defrag, &ra); } atomic_dec(&fs_info->defrag_running); @@ -1302,8 +1306,7 @@ static int defrag_one_cluster(struct btrfs_inode *inode, if (entry->start + range_len <= *last_scanned_ret) continue; - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, + page_cache_sync_readahead(inode->vfs_inode.i_mapping, ra, NULL, entry->start >> PAGE_SHIFT, ((entry->start + range_len - 1) >> PAGE_SHIFT) - (entry->start >> PAGE_SHIFT) + 1); @@ -1335,7 +1338,7 @@ out: * Entry point to file defragmentation. * * @inode: inode to be defragged - * @ra: readahead state (can be NUL) + * @ra: readahead state * @range: defrag options including range and flags * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode @@ -1357,12 +1360,13 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); - bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; + ASSERT(ra); + if (isize == 0) return 0; @@ -1391,18 +1395,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cur = round_down(range->start, fs_info->sectorsize); last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - /* * Make writeback start from the beginning of the range, so that the * defrag range can be written sequentially. @@ -1457,8 +1449,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cond_resched(); } - if (ra_allocated) - kfree(ra); /* * Update range.start for autodefrag, this will indicate where to start * in next run. 
From 7e2a59508472edae5557ff67c2f61f911148be2d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 Aug 2024 14:15:52 -0400 Subject: [PATCH 082/110] btrfs: introduce EXTENT_DIO_LOCKED In order to support dropping the extent lock during a read we need a way to make sure that direct reads and direct writes for overlapping ranges are protected from each other. To accomplish this introduce another lock bit specifically for direct io. Subsequent patches will utilize this to protect direct IO operations. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 55 +++++++++++++++++---------------------- fs/btrfs/extent-io-tree.h | 38 ++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index c54c5d7a5cd5..6d08c100b01d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -126,7 +126,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * Empty an io tree, removing and freeing every extent state record from the * tree. This should be called once we are sure no other task can access the * tree anymore, so no tree updates happen after we empty the tree and there - * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never + * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ void extent_io_tree_release(struct extent_io_tree *tree) @@ -141,7 +141,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { /* Clear node to keep free_extent_state() happy. */ RB_CLEAR_NODE(&state->rb_node); - ASSERT(!(state->state & EXTENT_LOCKED)); + ASSERT(!(state->state & EXTENT_LOCK_BITS)); /* * No need for a memory barrier here, as we are holding the tree * lock and we only change the waitqueue while holding that lock @@ -399,7 +399,7 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + if (state->state & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) return; merge_prev_state(tree, state); @@ -445,7 +445,7 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, struct rb_node *parent = NULL; const u64 start = state->start - 1; const u64 end = state->end + 1; - const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); + const bool try_merge = !(bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -616,9 +616,6 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * inserting elements in the tree, so the gfp mask is used to indicate which * allocations or sleeping are allowed. * - * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given - * range from the tree regardless of state (ie for truncate). - * * The range [start, end] is inclusive. * * This takes the tree lock, and returns 0 on success and < 0 on error. @@ -647,8 +644,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = (bits & EXTENT_LOCKED) ? 1 : 0; - if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) + wake = ((bits & EXTENT_LOCK_BITS) ? 
1 : 0); + if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) clear = 1; again: if (!prealloc) { @@ -861,8 +858,7 @@ static void cache_state_if_flags(struct extent_state *state, static void cache_state(struct extent_state *state, struct extent_state **cached_ptr) { - return cache_state_if_flags(state, cached_ptr, - EXTENT_LOCKED | EXTENT_BOUNDARY); + return cache_state_if_flags(state, cached_ptr, EXTENT_LOCK_BITS | EXTENT_BOUNDARY); } /* @@ -1063,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int ret = 0; u64 last_start; u64 last_end; - u32 exclusive_bits = (bits & EXTENT_LOCKED); + u32 exclusive_bits = (bits & EXTENT_LOCK_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -1812,12 +1808,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * We don't support EXTENT_LOCKED yet, as current changeset will - * record any bits changed, so for EXTENT_LOCKED case, it will - * either fail with -EEXIST or changeset will record the whole - * range. + * We don't support EXTENT_LOCK_BITS yet, as current changeset will + * record any bits changed, so for EXTENT_LOCK_BITS case, it will either + * fail with -EEXIST or changeset will record the whole range. */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } @@ -1826,26 +1821,25 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset) { /* - * Don't support EXTENT_LOCKED case, same reason as + * Don't support EXTENT_LOCK_BITS case, same reason as * set_record_extent_bits(). */ - ASSERT(!(bits & EXTENT_LOCKED)); + ASSERT(!(bits & EXTENT_LOCK_BITS)); return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached); + clear_extent_bit(tree, start, failed_start - 1, bits, cached); return 0; } return 1; @@ -1855,23 +1849,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. 
*/ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state) +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, cached_state); + bits, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, - &failed_state); - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, + wait_extent_bit(tree, failed_start, end, bits, &failed_state); + err = __set_extent_bit(tree, start, end, bits, &failed_start, &failed_state, cached_state, NULL); } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 9d3a52d8f59a..6ffef1cd37c1 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -19,6 +19,7 @@ enum { ENUM_BIT(EXTENT_DIRTY), ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), ENUM_BIT(EXTENT_DELALLOC), ENUM_BIT(EXTENT_DEFRAG), @@ -67,6 +68,8 @@ enum { EXTENT_ADD_INODE_BYTES | \ EXTENT_CLEAR_ALL_BITS) +#define EXTENT_LOCK_BITS (EXTENT_LOCKED | EXTENT_DIO_LOCKED) + /* * Redefined bits above which are used only in the device allocation tree, * shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV @@ -134,12 +137,22 @@ const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tre void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); +int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached); +static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached); +} int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); @@ -212,5 +225,22 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); +static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); +} + +static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) +{ + return __clear_extent_bit(tree, start, end, 
EXTENT_DIO_LOCKED, cached, NULL); } #endif /* BTRFS_EXTENT_IO_TREE_H */
From 07d399cb4e1881bba39910bcb4de68a5bd633e03 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 19 Aug 2024 16:50:01 -0400
Subject: [PATCH 083/110] btrfs: take the dio extent lock during O_DIRECT operations

Currently we hold the extent lock for the entire duration of a read. This isn't really necessary in the buffered case, where we're protected by the page lock, but it is necessary for O_DIRECT.

For O_DIRECT reads, if we only locked the extent for the part where we get the extent, we could potentially race with an O_DIRECT write in the same region. This isn't really a problem, unless the read is delayed so much that the write does the COW, unpins the old extent, and some other application re-allocates the extent before the read is actually able to be submitted. At that point at best we'd have a checksum mismatch, but at worst we could read data that doesn't belong to us.

To address this potential race we need to make sure we don't have overlapping, concurrent direct io reads and writes. To accomplish this use the new EXTENT_DIO_LOCKED bit in the direct IO case in the same spot as the current extent lock. The writes will take this while they're creating the ordered extent, which is also used to make sure concurrent buffered reads or concurrent direct reads are not allowed to occur, and drop it after the ordered extent is taken. For reads it mirrors the current EXTENT_LOCKED read behavior: we set it when starting the read and clear it in the end_io handler to allow other direct writes to continue.

This still has the drawback of disallowing concurrent overlapping direct reads, but that limitation exists with the current extent locking as well.

Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/direct-io.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-)
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index ea7f918b1c45..4a5f9b2632f2 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -40,11 +40,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct btrfs_ordered_extent *ordered; int ret = 0; + /* Direct lock must be taken before the extent lock.
*/ + if (nowait) { + if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + return -EAGAIN; + } else { + lock_dio_extent(io_tree, lockstart, lockend, cached_state); + } + while (1) { if (nowait) { if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) - return -EAGAIN; + cached_state)) { + ret = -EAGAIN; + break; + } } else { lock_extent(io_tree, lockstart, lockend, cached_state); }
@@ -120,6 +130,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cond_resched(); } + if (ret) + unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; }
@@ -546,8 +558,9 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, } if (unlock_extents) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, + &cached_state); else free_extent_state(cached_state);
@@ -572,8 +585,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, return 0; unlock_err: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + /* + * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget + * to update this, be explicit that we expect EXTENT_LOCKED and + * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -596,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - NULL); + clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); return 0; }
@@ -608,8 +626,9 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); ret = -ENOTBLK; } if (write) {
@@ -641,8 +660,9 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - unlock_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); + clear_extent_bit(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); } bbio->bio.bi_private = bbio->private;
From ac325fc2aad513072722387a71bf857c938aae4e Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 16 Aug 2024 15:16:24 -0400
Subject: [PATCH 084/110] btrfs: do not hold the extent lock for entire read

Historically we've held the extent lock throughout the entire read. There have been a few reasons for this, but it's mostly just caused us problems. For example, this prevents us from allowing page faults during direct io reads, because we could deadlock. This has forced us to only allow 4k reads at a time for io_uring NOWAIT requests because we have no idea if we'll be forced to page fault and thus have to do a whole lot of work.
On the buffered side we are protected by the page lock: as long as we're reading, things like buffered writes, punch hole, and even direct IO to a certain degree will get hung up on the page lock while the page is in flight.

On the direct side we have the dio extent lock, which acts much like the extent lock did before this patch, but only for direct reads. This protects direct reads from concurrent direct writes, while we're protected from buffered writes via the inode lock.

Now that we're protected in all cases, narrow the extent lock to the part where we're getting the extent map to submit the reads, no longer holding the extent lock for the entire read operation. Push the extent lock down into do_readpage() so that we're only grabbing it when looking up the extent map. This portion was contributed by Goldwyn.

Co-developed-by: Goldwyn Rodrigues Reviewed-by: Goldwyn Rodrigues Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/direct-io.c | 49 +++++++++++----------- fs/btrfs/extent_io.c | 94 ++---------------------------------------- 3 files changed, 29 insertions(+), 116 deletions(-)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 39cd2ed1974b..52952745d44a 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -521,6 +521,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, } add_size = min(em->start + em->len, page_end + 1) - cur; free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); if (folio->index == end_index) { size_t zero_offset = offset_in_folio(folio, isize);
@@ -534,7 +535,6 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (!bio_add_folio(orig_bio, folio, add_size, offset_in_folio(folio, cur))) { - unlock_extent(tree, cur, page_end, NULL); folio_unlock(folio); folio_put(folio); break;
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 4a5f9b2632f2..bd38df5647e3 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -365,7 +365,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; const u64 data_alloc_len = length; - bool unlock_extents = false; + u32 unlock_bits = EXTENT_LOCKED; /* * We could potentially fault if we have a buffer > PAGE_SIZE, and if
@@ -526,7 +526,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, start, &len, flags); if (ret < 0) goto unlock_err; - unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); if (dio_data->data_space_reserved) {
@@ -547,23 +546,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, release_offset, release_len); } - } else { - /* - * We need to unlock only the end area that we aren't using. - * The rest is going to be unlocked by the endio routine. - */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; } - if (unlock_extents) - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, - &cached_state); - else - free_extent_state(cached_state); - /* * Translate extent map information to iomap. * We trim the extents (and move the addr) even though iomap code does
@@ -582,6 +566,23 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->length = len; free_extent_map(em); + /* + * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, + * writes only hold it for this part.
We hold the extent lock until + * we're completely done with the extent map to make sure it remains + * valid. + */ + if (write) + unlock_bits |= EXTENT_DIO_LOCKED; + + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); + + /* We didn't use everything, unlock the dio extent for the remainder. */ + if (!write && (start + len) < lockend) + unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); + return 0; unlock_err: @@ -614,8 +615,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); return 0; } @@ -626,9 +627,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); + unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { @@ -660,9 +660,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - clear_extent_bit(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL); + unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f7a388529c17..bd1a7b2fc71a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -480,75 +480,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) bio_put(bio); } -/* - * Record previously processed extent range - * - * For endio_readpage_release_extent() to handle a full extent range, reducing - * the extent io operations. - */ -struct processed_extent { - struct btrfs_inode *inode; - /* Start of the range in @inode */ - u64 start; - /* End of the range in @inode */ - u64 end; - bool uptodate; -}; - -/* - * Try to release processed extent range - * - * May not release the extent range right now if the current range is - * contiguous to processed extent. - * - * Will release processed extent when any of @inode, @uptodate, the range is - * no longer contiguous to the processed range. - * - * Passing @inode == NULL will force processed extent to be released. - */ -static void endio_readpage_release_extent(struct processed_extent *processed, - struct btrfs_inode *inode, u64 start, u64 end, - bool uptodate) -{ - struct extent_state *cached = NULL; - struct extent_io_tree *tree; - - /* The first extent, initialize @processed */ - if (!processed->inode) - goto update; - - /* - * Contiguous to processed extent, just uptodate the end. - * - * Several things to notice: - * - * - bio can be merged as long as on-disk bytenr is contiguous - * This means we can have page belonging to other inodes, thus need to - * check if the inode still matches. 
- * - bvec can contain range beyond current page for multi-page bvec - * Thus we need to do processed->end + 1 >= start check - */ - if (processed->inode == inode && processed->uptodate == uptodate && - processed->end + 1 >= start && end >= processed->end) { - processed->end = end; - return; - } - - tree = &processed->inode->io_tree; - /* - * Now we don't have range contiguous to the processed range, release - * the processed range now. - */ - unlock_extent(tree, processed->start, processed->end, &cached); - -update: - /* Update processed to current range */ - processed->inode = inode; - processed->start = start; - processed->end = end; - processed->uptodate = uptodate; -} - static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); @@ -575,7 +506,6 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) { struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; - struct processed_extent processed = { 0 }; struct folio_iter fi; const u32 sectorsize = fs_info->sectorsize; @@ -640,11 +570,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) /* Update page status and unlock. */ end_folio_read(folio, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); } - /* Release the last extent */ - endio_readpage_release_extent(&processed, NULL, 0, 0, false); bio_put(bio); } @@ -973,6 +899,7 @@ static struct extent_map *__get_extent_map(struct inode *inode, u64 len, struct extent_map **em_cached) { struct extent_map *em; + struct extent_state *cached_state = NULL; ASSERT(em_cached); @@ -988,12 +915,15 @@ static struct extent_map *__get_extent_map(struct inode *inode, *em_cached = NULL; } + btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), start, start + len - 1, &cached_state); em = btrfs_get_extent(BTRFS_I(inode), folio, start, len); if (!IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; } + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + return em; } /* @@ -1019,11 +949,9 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, size_t pg_offset = 0; size_t iosize; size_t blocksize = fs_info->sectorsize; - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_extent(tree, start, end, NULL); folio_unlock(folio); return ret; } @@ -1047,14 +975,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (cur >= last_byte) { iosize = folio_size(folio) - pg_offset; folio_zero_range(folio, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); end_folio_read(folio, true, cur, iosize); break; } em = __get_extent_map(inode, folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { - unlock_extent(tree, cur, end, NULL); end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } @@ -1123,7 +1049,6 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (block_start == EXTENT_MAP_HOLE) { folio_zero_range(folio, pg_offset, iosize); - unlock_extent(tree, cur, cur + iosize - 1, NULL); end_folio_read(folio, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; @@ -1131,7 +1056,6 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - unlock_extent(tree, cur, cur + iosize - 1, NULL); end_folio_read(folio, 
true, cur, iosize); cur = cur + iosize; pg_offset += iosize;
@@ -1156,15 +1080,10 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, int btrfs_read_folio(struct file *file, struct folio *folio) { - struct btrfs_inode *inode = folio_to_inode(folio); - u64 start = folio_pos(folio); - u64 end = start + folio_size(folio) - 1; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); free_extent_map(em_cached);
@@ -2337,15 +2256,10 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; - struct btrfs_inode *inode = BTRFS_I(rac->mapping->host); struct folio *folio; - u64 start = readahead_pos(rac); - u64 end = start + readahead_length(rac) - 1; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
From 9ca0e58cb752b09816f56f7a3147a39773d5e831 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Sat, 24 Aug 2024 19:36:43 +0930
Subject: [PATCH 085/110] btrfs: merge btrfs_orig_bbio_end_io() into btrfs_bio_end_io()

There are only two differences between the two functions:

- btrfs_orig_bbio_end_io() does extra error propagation
  This is mostly to allow tolerance for write errors.

- btrfs_orig_bbio_end_io() does an extra pending_ios check
  This check can handle both the original bio and the cloned one.
  (All accounting happens in the original one).

This makes btrfs_orig_bbio_end_io() a much safer call. In fact we already had a double free error due to the usage of btrfs_bio_end_io() in the error path of btrfs_submit_chunk().

So just move the whole content of btrfs_orig_bbio_end_io() into btrfs_bio_end_io().

For normal paths this brings no change, because they are already calling btrfs_orig_bbio_end_io() in the first place.

For error paths (not only inside bio.c but also external callers), this change will introduce extra checks, especially for external callers, as they will error out without submitting the btrfs bio. But considering it's already in the error path, such slower but much safer checks are still an overall win.
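After the merge the single entry point starts out as follows; a condensed sketch based on the hunks below, where the elided parts are the old btrfs_orig_bbio_end_io() body moved over unchanged:

	void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
	{
		bbio->bio.bi_status = status;

		/* A cloned bbio defers completion to the original bbio. */
		if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
			struct btrfs_bio *orig_bbio = bbio->private;

			/* ... error propagation and the pending_ios check,
			 * taken verbatim from btrfs_orig_bbio_end_io() ... */
		}
		/* ... */
	}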
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index d5dcc356df33..ce13416bc10f 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -120,12 +120,6 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio) } } -void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - __btrfs_bio_end_io(bbio); -} - static void btrfs_orig_write_end_io(struct bio *bio); static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, @@ -147,8 +141,9 @@ static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, } } -static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; @@ -179,7 +174,7 @@ static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) static void btrfs_repair_done(struct btrfs_failed_bio *fbio) { if (atomic_dec_and_test(&fbio->repair_count)) { - btrfs_orig_bbio_end_io(fbio->bbio); + btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status); mempool_free(fbio, &btrfs_failed_bio_pool); } } @@ -326,7 +321,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de if (fbio) btrfs_repair_done(fbio); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) @@ -360,7 +355,7 @@ static void btrfs_end_bio_work(struct work_struct *work) if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) @@ -380,7 +375,7 @@ static void btrfs_simple_end_io(struct bio *bio) } else { if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) btrfs_record_physical_zoned(bbio); - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } } @@ -394,7 +389,7 @@ static void btrfs_raid56_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -424,7 +419,7 @@ static void btrfs_orig_write_end_io(struct bio *bio) if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); btrfs_put_bioc(bioc); } @@ -593,7 +588,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. 
*/ if (bio->bi_status) { - btrfs_orig_bbio_end_io(async->bbio); + btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); return; }
@@ -768,11 +763,9 @@ fail: ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining); - remaining->bio.bi_status = ret; - btrfs_orig_bbio_end_io(remaining); + btrfs_bio_end_io(remaining, ret); } - bbio->bio.bi_status = ret; - btrfs_orig_bbio_end_io(bbio); + btrfs_bio_end_io(bbio, ret); /* Do not submit another chunk */ return true; }
From fd1e75d0105d2289e24b6d63cee49e6f7952c8bf Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 29 May 2024 17:03:47 +0930
Subject: [PATCH 086/110] btrfs: make compression path to be subpage compatible

Currently the btrfs compression path is not really subpage compatible, everything is still done in page units.

That's fine for the regular sector size, and even for the subpage routine, since compression there is only enabled if the whole range is page aligned, so reading the page cache in page units is totally fine.

However, in preparation for future subpage perfect compression support, we need to change the compression routines to properly handle a subpage range.

Prepare both zlib and zstd to read only the subpage range for compression. Lzo already does subpage-aware reads, as its on-disk format is sectorsize dependent.

Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.h | 8 ++++++++ fs/btrfs/zlib.c | 19 ++++++++++++++++--- fs/btrfs/zstd.c | 19 +++++++++++++------ 3 files changed, 37 insertions(+), 9 deletions(-)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index cfdc64319186..5d01f092ae13 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -82,6 +82,14 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } +/* @range_end must be exclusive. */ +static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +{ + u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + + return min(range_end, page_end) - cur; +} + int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 30971dd741e2..ee80d064b954 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -20,6 +20,8 @@ #include #include "btrfs_inode.h" #include "compression.h" +#include "fs.h" +#include "subpage.h" /* workspace buffer size for s390 zlib hardware support */ #define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE)
@@ -108,6 +110,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; + const u64 orig_end = start + len; *out_folios = 0; *total_out = 0;
@@ -153,6 +156,10 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, if (in_buf_folios > 1) { int i; + /* S390 hardware acceleration path, not subpage.
*/ + ASSERT(!btrfs_is_subpage( + inode_to_fs_info(mapping->host), + mapping)); for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); @@ -167,9 +174,14 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; + workspace->strm.avail_in = + (in_buf_folios << PAGE_SHIFT); } workspace->strm.next_in = workspace->buf; } else { + unsigned int pg_off; + unsigned int cur_len; + if (data_in) { kunmap_local(data_in); folio_put(in_folio); @@ -179,12 +191,13 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, start, &in_folio); if (ret < 0) goto out; - data_in = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + data_in = kmap_local_folio(in_folio, pg_off); start += PAGE_SIZE; workspace->strm.next_in = data_in; + workspace->strm.avail_in = cur_len; } - workspace->strm.avail_in = min(bytes_left, - (unsigned long) workspace->buf_size); } ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 2a079561b2b1..05cf7cebc17c 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -389,7 +389,10 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long tot_out = 0; unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; + const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; + unsigned int pg_off; + unsigned int cur_len; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -415,9 +418,11 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; /* Allocate and map in the output buffer */ out_folio = btrfs_alloc_compr_folio(); @@ -494,14 +499,16 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); - start += PAGE_SIZE; - len -= PAGE_SIZE; + start += cur_len; + len -= cur_len; ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - workspace->in_buf.src = kmap_local_folio(in_folio, 0); + pg_off = offset_in_page(start); + cur_len = btrfs_calc_input_length(orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); workspace->in_buf.pos = 0; - workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); + workspace->in_buf.size = cur_len; } } while (1) { From 266a9361a4cb196ef5e5b4bb0a6c7f8bf2139825 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:55 +0800 Subject: [PATCH 087/110] btrfs: convert clear_page_extent_mapped() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Now clear_page_extent_mapped() can deal with a folio directly, so change its name to clear_folio_extent_mapped(). 
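As an illustration only (not part of this change), a call site that still has a struct page at hand can bridge to the renamed helper via page_folio(), which resolves a page to its containing folio:

	/* Hypothetical caller, shown only to illustrate the bridge. */
	static void example_release(struct page *page)
	{
		struct folio *folio = page_folio(page);

		clear_folio_extent_mapped(folio);
	}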
Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bd1a7b2fc71a..8b5f36474ae1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -877,18 +877,17 @@ int set_folio_extent_mapped(struct folio *folio) return 0; } -void clear_page_extent_mapped(struct page *page) +void clear_folio_extent_mapped(struct folio *folio) { - struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; - ASSERT(page->mapping); + ASSERT(folio->mapping); if (!folio_test_private(folio)) return; - fs_info = page_to_fs_info(page); - if (btrfs_is_subpage(fs_info, page->mapping)) + fs_info = folio_to_fs_info(folio); + if (btrfs_is_subpage(fs_info, folio->mapping)) return btrfs_detach_subpage(fs_info, folio); folio_detach_private(folio); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b38460279b99..1d9b30021109 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -249,7 +249,7 @@ int btree_write_cache_pages(struct address_space *mapping, void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); int set_page_extent_mapped(struct page *page); -void clear_page_extent_mapped(struct page *page); +void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e96b63d7e8fd..5e3c772eed2b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7240,7 +7240,7 @@ static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (try_release_extent_mapping(&folio->page, gfp_flags)) { wait_subpage_spinlock(folio); - clear_page_extent_mapped(&folio->page); + clear_folio_extent_mapped(folio); return true; } return false; @@ -7438,7 +7438,7 @@ next: btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); - clear_page_extent_mapped(&folio->page); + clear_folio_extent_mapped(folio); } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) From d4aeb5f7a7e67d780e3eaae0b6e7d4e2d31042ee Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:56 +0800 Subject: [PATCH 088/110] btrfs: convert get_next_extent_buffer() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Use folio_pos instead of page_offset, which is more consistent with folio usage. 
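As a sketch (illustration only, not part of this change), the two helpers agree for an order-0, i.e. single page, folio:

	u64 start_old = page_offset(&folio->page);	/* old page API */
	u64 start_new = folio_pos(folio);		/* new folio API */

	ASSERT(start_old == start_new);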
Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8b5f36474ae1..65b6b391f283 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4041,17 +4041,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, #define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) + const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) { struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; - u64 page_start = page_offset(page); - u64 cur = page_start; + u64 folio_start = folio_pos(folio); + u64 cur = folio_start; - ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { + while (cur < folio_start + PAGE_SIZE) { int ret; int i; @@ -4063,7 +4063,7 @@ static struct extent_buffer *get_next_extent_buffer( goto out; for (i = 0; i < ret; i++) { /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) + if (gang[i]->start >= folio_start + PAGE_SIZE) goto out; /* Found one */ if (gang[i]->start >= bytenr) { @@ -4096,7 +4096,7 @@ static int try_release_subpage_extent_buffer(struct page *page) * with spinlock rather than RCU. */ spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, page, cur); + eb = get_next_extent_buffer(fs_info, page_folio(page), cur); if (!eb) { /* No more eb in the page range after or at cur */ spin_unlock(&fs_info->buffer_lock); From 0145aa38cb39a041025747d02c276ac9a9acece2 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:57 +0800 Subject: [PATCH 089/110] btrfs: convert try_release_subpage_extent_buffer() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. And use folio_pos instead of page_offset, which is more consistent with folio usage. At the same time, folio_test_private() can handle folio directly without converting from page to folio first. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 65b6b391f283..f5508cfb36d9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4077,11 +4077,11 @@ out: return found; } -static int try_release_subpage_extent_buffer(struct page *page) +static int try_release_subpage_extent_buffer(struct folio *folio) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - u64 cur = page_offset(page); - const u64 end = page_offset(page) + PAGE_SIZE; + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); + u64 cur = folio_pos(folio); + const u64 end = cur + PAGE_SIZE; int ret; while (cur < end) { @@ -4096,7 +4096,7 @@ static int try_release_subpage_extent_buffer(struct page *page) * with spinlock rather than RCU. 
*/ spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, page_folio(page), cur); + eb = get_next_extent_buffer(fs_info, folio, cur); if (!eb) { /* No more eb in the page range after or at cur */ spin_unlock(&fs_info->buffer_lock); @@ -4137,12 +4137,12 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. */ - spin_lock(&page->mapping->i_private_lock); - if (!folio_test_private(page_folio(page))) + spin_lock(&folio->mapping->i_private_lock); + if (!folio_test_private(folio)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return ret; } @@ -4153,7 +4153,7 @@ int try_release_extent_buffer(struct page *page) struct extent_buffer *eb; if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page); + return try_release_subpage_extent_buffer(page_folio(page)); /* * We need to make sure nobody is changing folio private, as we rely on From b8ae2bfa685f1ba48d42660b163b7bec725fe697 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:58 +0800 Subject: [PATCH 090/110] btrfs: convert try_release_extent_buffer() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 15 +++++++-------- fs/btrfs/extent_io.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 612460e07b2e..25d768e67e37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -525,7 +525,7 @@ static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; - return try_release_extent_buffer(&folio->page); + return try_release_extent_buffer(folio); } static void btree_invalidate_folio(struct folio *folio, size_t offset, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f5508cfb36d9..f8b001053d05 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4147,21 +4147,20 @@ static int try_release_subpage_extent_buffer(struct folio *folio) } -int try_release_extent_buffer(struct page *page) +int try_release_extent_buffer(struct folio *folio) { - struct folio *folio = page_folio(page); struct extent_buffer *eb; - if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return try_release_subpage_extent_buffer(page_folio(page)); + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + return try_release_subpage_extent_buffer(folio); /* * We need to make sure nobody is changing folio private, as we rely on * folio private as the pointer to extent buffer. 
*/ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 1; } @@ -4176,10 +4175,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1d9b30021109..345774c84c4b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -237,7 +237,7 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) } bool try_release_extent_mapping(struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page); +int try_release_extent_buffer(struct folio *folio); int btrfs_read_folio(struct file *file, struct folio *folio); void extent_write_locked_range(struct inode *inode, const struct folio *locked_folio, From 884937793db595928961397dd3ec2287b40371c6 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:28:59 +0800 Subject: [PATCH 091/110] btrfs: convert read_key_bytes() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Moreover, use kmap_local_folio() instead of kmap_local_page(), which is more consistent with folio usage. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/verity.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 4042dd6437ae..e36dc99021a0 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -284,7 +284,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * page and ignore dest, but it must still be non-NULL to avoid the * counting-only behavior. * @len: length in bytes to read - * @dest_page: copy into this page instead of the dest buffer + * @dest_folio: copy into this folio instead of the dest buffer * * Helper function to read items from the btree. This returns the number of * bytes read or < 0 for errors. We can return short reads if the items don't @@ -294,7 +294,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, * Returns number of bytes read or a negative error code on failure. 
*/ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, - char *dest, u64 len, struct page *dest_page) + char *dest, u64 len, struct folio *dest_folio) { struct btrfs_path *path; struct btrfs_root *root = inode->root; @@ -314,7 +314,7 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, if (!path) return -ENOMEM; - if (dest_page) + if (dest_folio) path->reada = READA_FORWARD; key.objectid = btrfs_ino(inode); @@ -371,15 +371,15 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, copy_offset = offset - key.offset; if (dest) { - if (dest_page) - kaddr = kmap_local_page(dest_page); + if (dest_folio) + kaddr = kmap_local_folio(dest_folio, 0); data = btrfs_item_ptr(leaf, path->slots[0], void); read_extent_buffer(leaf, kaddr + dest_offset, (unsigned long)data + copy_offset, copy_bytes); - if (dest_page) + if (dest_folio) kunmap_local(kaddr); } @@ -762,7 +762,7 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - folio_address(folio), PAGE_SIZE, &folio->page); + folio_address(folio), PAGE_SIZE, folio); if (ret < 0) { folio_put(folio); return ERR_PTR(ret); From 135873258c6127077e2b0db83ddd08e3e4215b3b Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:00 +0800 Subject: [PATCH 092/110] btrfs: convert submit_eb_subpage() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Moreover, use folio_pos() instead of page_offset(), which is more consistent with folio usage. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f8b001053d05..f16c5be22849 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1735,12 +1735,11 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) +static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) { - struct btrfs_fs_info *fs_info = page_to_fs_info(page); - struct folio *folio = page_folio(page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); int submitted = 0; - u64 page_start = page_offset(page); + u64 folio_start = folio_pos(folio); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; @@ -1755,21 +1754,21 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. 
*/ - spin_lock(&page->mapping->i_private_lock); + spin_lock(&folio->mapping->i_private_lock); if (!folio_test_private(folio)) { - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); bit_start++; continue; } - start = page_start + bit_start * fs_info->sectorsize; + start = folio_start + bit_start * fs_info->sectorsize; bit_start += sectors_per_node; /* @@ -1778,7 +1777,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->i_private_lock); + spin_unlock(&folio->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1829,7 +1828,7 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) return 0; if (page_to_fs_info(page)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc); + return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { From 08dd8507b11684427f5d0f07f18f1b1fb2d9f28a Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:01 +0800 Subject: [PATCH 093/110] btrfs: convert submit_eb_page() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f16c5be22849..14d2cc71fffd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1816,18 +1816,17 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) +static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = page->mapping; - struct folio *folio = page_folio(page); + struct address_space *mapping = folio->mapping; struct extent_buffer *eb; int ret; if (!folio_test_private(folio)) return 0; - if (page_to_fs_info(page)->nodesize < PAGE_SIZE) + if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) return submit_eb_subpage(folio, wbc); spin_lock(&mapping->i_private_lock); @@ -1926,7 +1925,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &ctx); + ret = submit_eb_page(folio, &ctx); if (ret == 0) continue; if (ret < 0) { From dd0a8df455665fe896125e15dfe3847f1e18462f Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:02 +0800 Subject: [PATCH 094/110] btrfs: convert try_release_extent_state() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Moreover, use folio_pos() instead of page_offset(), which is more consistent with folio usage. 
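As an aside (illustration only), the byte range covered by a folio can be derived as below; the converted code still assumes order-0 folios by using PAGE_SIZE, while folio_size() would also cover large folios:

	u64 start = folio_pos(folio);
	u64 end = start + folio_size(folio) - 1;	/* folio_size() == PAGE_SIZE for order-0 */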
Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 14d2cc71fffd..430dc1b9de32 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2303,9 +2303,9 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * to drop the page. */ static bool try_release_extent_state(struct extent_io_tree *tree, - struct page *page, gfp_t mask) + struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; bool ret; @@ -2414,7 +2414,7 @@ next: cond_resched(); } } - return try_release_extent_state(io_tree, page, mask); + return try_release_extent_state(io_tree, folio, mask); } static void __free_extent_buffer(struct extent_buffer *eb) From 046c0d65962504d8ec1e109cb673c81ba36da1e3 Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:03 +0800 Subject: [PATCH 095/110] btrfs: convert try_release_extent_mapping() to take a folio The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. And page_to_inode() can be replaced with folio_to_inode() now. Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 430dc1b9de32..c07930986fe5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2341,11 +2341,11 @@ static bool try_release_extent_state(struct extent_io_tree *tree, * in the range corresponding to the page, both state records and extent * map records are removed */ -bool try_release_extent_mapping(struct page *page, gfp_t mask) +bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { - u64 start = page_offset(page); + u64 start = folio_pos(folio); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *inode = page_to_inode(page); + struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; while (start <= end) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 345774c84c4b..8a36117ed453 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -236,7 +236,7 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -bool try_release_extent_mapping(struct page *page, gfp_t mask); +bool try_release_extent_mapping(struct folio *folio, gfp_t mask); int try_release_extent_buffer(struct folio *folio); int btrfs_read_folio(struct file *file, struct folio *folio); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e3c772eed2b..1483e33127b7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7238,7 +7238,7 @@ static int btrfs_launder_folio(struct folio *folio) static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (try_release_extent_mapping(&folio->page, gfp_flags)) { + if (try_release_extent_mapping(folio, gfp_flags)) { wait_subpage_spinlock(folio); clear_folio_extent_mapped(folio); return true; From 54c78d497b383f5828b019d41149a9e76cfc771c Mon Sep 17 00:00:00 2001 From: Li Zetao Date: Thu, 29 Aug 2024 02:29:04 +0800 Subject: [PATCH 096/110] btrfs: convert zlib_decompress() to take a folio The old page API is being gradually replaced and converted to use folio to improve code 
readability and avoid repeated conversion between page and folio. Also, memcpy_to_page() can be replaced with memcpy_to_folio(). There is no memzero_folio(), but memzero_page() can be replaced equivalently by folio_zero_range().

Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba
---
fs/btrfs/compression.c | 2 +-
fs/btrfs/compression.h | 2 +-
fs/btrfs/zlib.c | 14 +++++++-------
3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 52952745d44a..e97692fdfab0 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -142,7 +142,7 @@ static int compression_decompress(int type, struct list_head *ws, unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { - case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, dest_pgoff, srclen, destlen);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 5d01f092ae13..f4f7a981cb90 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -162,7 +162,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ee80d064b954..100abc00b794 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -393,7 +393,7 @@ done: } int zlib_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list);
@@ -421,12 +421,12 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, ret = zlib_inflateInit2(&workspace->strm, wbits); if (unlikely(ret != Z_OK)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zlib decompression init failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); return -EIO; }
@@ -439,16 +439,16 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in, if (ret != Z_STREAM_END) goto out; - memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, to_copy); out: if (unlikely(to_copy != destlen)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zlib decompression failed, error %d root %llu inode %llu offset %llu decompressed %lu expected %zu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - folio_pos(dest_folio), to_copy, destlen); wait
memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); return ret; }
From 9f9a4e43a87082144e43320edaf38d980d18d069 Mon Sep 17 00:00:00 2001
From: Li Zetao
Date: Thu, 29 Aug 2024 02:29:05 +0800
Subject: [PATCH 097/110] btrfs: convert lzo_decompress() to take a folio

The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Also, memcpy_to_page() can be replaced with memcpy_to_folio(). There is no memzero_folio(), but memzero_page() can be replaced equivalently by folio_zero_range().

Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba
---
fs/btrfs/compression.c | 2 +-
fs/btrfs/compression.h | 2 +-
fs/btrfs/lzo.c | 12 ++++++------
3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e97692fdfab0..a70ca9c94ea7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -144,7 +144,7 @@ static int compression_decompress(int type, struct list_head *ws, switch (type) { case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, dest_pgoff, srclen, destlen);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index f4f7a981cb90..4b5a7ba54815 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -173,7 +173,7 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 1e2a68b8f62d..72856f6775f7 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -438,11 +438,11 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } int lzo_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); const u32 sectorsize = fs_info->sectorsize; size_t in_len; size_t out_len;
@@ -467,22 +467,22 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, out_len = sectorsize; ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); if (unlikely(ret != LZO_E_OK)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(fs_info, "lzo decompression failed, error %d root %llu inode %llu offset %llu", ret, btrfs_root_id(inode->root), btrfs_ino(inode), - page_offset(dest_page)); + folio_pos(dest_folio)); ret = -EIO; goto out; } ASSERT(out_len <= sectorsize); -
memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); /* Early end, considered as an error. */ if (unlikely(out_len < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len); + folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); } out: return ret;
From b70f3a45464b012feb8e86f15f37e0c4b2f69fe1 Mon Sep 17 00:00:00 2001
From: Li Zetao
Date: Thu, 29 Aug 2024 02:29:06 +0800
Subject: [PATCH 098/110] btrfs: convert zstd_decompress() to take a folio

The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Also, memcpy_to_page() can be replaced with memcpy_to_folio(). There is no memzero_folio(), but memzero_page() can be replaced equivalently by folio_zero_range().

Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba
---
fs/btrfs/compression.c | 2 +-
fs/btrfs/compression.h | 2 +-
fs/btrfs/zstd.c | 16 ++++++++--------
3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a70ca9c94ea7..a5f7cbe1c06e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -146,7 +146,7 @@ static int compression_decompress(int type, struct list_head *ws, dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, page_folio(dest_page), dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default:
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 4b5a7ba54815..d2453cf28eef 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -183,7 +183,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void);
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 05cf7cebc17c..866607fd3e58 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -656,11 +656,11 @@ done: } int zstd_decompress(struct list_head *ws, const u8 *data_in, - struct page *dest_page, unsigned long dest_pgoff, size_t srclen, + struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); + struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb); const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0;
@@ -669,12 +669,12 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression init failed, root %llu inode %llu offset %llu", btrfs_root_id(inode->root), btrfs_ino(inode), -
page_offset(dest_page)); + folio_pos(dest_folio)); ret = -EIO; goto finish; }
@@ -693,21 +693,21 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, */ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret))) { - struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); + struct btrfs_inode *inode = folio_to_inode(dest_folio); btrfs_err(inode->root->fs_info, "zstd decompression failed, error %d root %llu inode %llu offset %llu", zstd_get_error_code(ret), btrfs_root_id(inode->root), - btrfs_ino(inode), page_offset(dest_page)); + btrfs_ino(inode), folio_pos(dest_folio)); goto finish; } to_copy = workspace->out_buf.pos; - memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); + memcpy_to_folio(dest_folio, dest_pgoff, workspace->out_buf.dst, to_copy); finish: /* Error or early end. */ if (unlikely(to_copy < destlen)) { ret = -EIO; - memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); + folio_zero_range(dest_folio, dest_pgoff + to_copy, destlen - to_copy); } return ret; }
From aeb6d8814841ec106acc5ffea772d4102ffc72b6 Mon Sep 17 00:00:00 2001
From: Li Zetao
Date: Thu, 29 Aug 2024 02:29:07 +0800
Subject: [PATCH 099/110] btrfs: convert btrfs_decompress() to take a folio

The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Based on the previous patches, the compression path can now use folios directly, without converting to pages first.

Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba
---
fs/btrfs/compression.c | 14 +++++++-------
fs/btrfs/compression.h | 2 +-
fs/btrfs/inode.c | 2 +-
fs/btrfs/reflink.c | 2 +-
4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a5f7cbe1c06e..90aef2627ca2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -138,15 +138,15 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - const u8 *data_in, struct page *dest_page, + const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { switch (type) { - case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, page_folio(dest_page), + case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, page_folio(dest_page), + case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); - case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, page_folio(dest_page), + case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio, dest_pgoff, srclen, destlen); case BTRFS_COMPRESS_NONE: default:
@@ -1061,10 +1061,10 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) { - struct btrfs_fs_info *fs_info = page_to_fs_info(dest_page); + struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio); struct list_head *workspace; const u32 sectorsize = fs_info->sectorsize; int ret;
@@ -1077,7 +1077,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize); workspace = get_workspace(type, 0); - ret = compression_decompress(type, workspace, data_in, dest_page, + ret = compression_decompress(type, workspace, data_in, dest_folio, dest_pgoff, srclen, destlen); put_workspace(type, workspace);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d2453cf28eef..b6563b6a333e 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -96,7 +96,7 @@ void __cold btrfs_exit_compress(void); int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1483e33127b7..edac499fd83d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6746,7 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, &folio->page, 0, inline_size, + ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); /*
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index df6b93b927cd..b768e590a44c 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -118,7 +118,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, memcpy_to_page(page, offset_in_page(file_offset), data_start, datal); } else { - ret = btrfs_decompress(comp_type, data_start, page, + ret = btrfs_decompress(comp_type, data_start, page_folio(page), offset_in_page(file_offset), inline_size, datal); if (ret)
From faad57ae20190de6375e1c3a7144c7ae66ab4ddf Mon Sep 17 00:00:00 2001
From: Li Zetao
Date: Thu, 29 Aug 2024 02:29:08 +0800
Subject: [PATCH 100/110] btrfs: convert copy_inline_to_page() to use folio

The old page API is being gradually replaced and converted to use folio to improve code readability and avoid repeated conversion between page and folio. Moreover, find_or_create_page() is a compatibility API and it can be replaced with __filemap_get_folio(). Some interfaces have already been converted to use folios, so the conversion operations from page can be eliminated here.
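Note that the error convention changes with this conversion: find_or_create_page() returns NULL on failure, while __filemap_get_folio() returns an ERR_PTR(). A minimal sketch of the pattern (illustration only):

	/* Old style: NULL on failure. */
	page = find_or_create_page(mapping, index, gfp);
	if (!page)
		return -ENOMEM;

	/* New style: ERR_PTR() on failure, with the equivalent FGP flags. */
	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
	if (IS_ERR(folio))
		return -ENOMEM;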
Signed-off-by: Li Zetao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reflink.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index b768e590a44c..f0824c948cb7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -66,7 +66,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; - struct page *page = NULL; + struct folio *folio = NULL; struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; @@ -83,14 +83,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret) goto out; - page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(mapping)); - if (!page) { + folio = __filemap_get_folio(mapping, file_offset >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + btrfs_alloc_write_mask(mapping)); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out_unlock; } - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) goto out_unlock; @@ -115,15 +116,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, offset_in_page(file_offset), data_start, - datal); + memcpy_to_folio(folio, offset_in_folio(folio, file_offset), data_start, + datal); } else { - ret = btrfs_decompress(comp_type, data_start, page_folio(page), - offset_in_page(file_offset), + ret = btrfs_decompress(comp_type, data_start, folio, + offset_in_folio(folio, file_offset), inline_size, datal); if (ret) goto out_unlock; - flush_dcache_page(page); + flush_dcache_folio(folio); } /* @@ -139,15 +140,15 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) - memzero_page(page, datal, block_size - datal); + folio_zero_range(folio, datal, block_size - datal); - btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size); - btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, folio, file_offset, block_size); + btrfs_folio_clear_checked(fs_info, folio, file_offset, block_size); + btrfs_folio_set_dirty(fs_info, folio, file_offset, block_size); out_unlock: - if (page) { - unlock_page(page); - put_page(page); + if (!IS_ERR(folio)) { + folio_unlock(folio); + folio_put(folio); } if (ret) btrfs_delalloc_release_space(inode, data_reserved, file_offset, From 3368597206dc3c6c3c2247ee146beada14c67380 Mon Sep 17 00:00:00 2001 From: Luca Stefani Date: Mon, 2 Sep 2024 13:10:53 +0200 Subject: [PATCH 101/110] btrfs: always update fstrim_range on failure in FITRIM ioctl Even in case of failure we could've discarded some data and userspace should be made aware of it, so copy fstrim_range to userspace regardless. Also make sure to update the trimmed bytes amount even if btrfs_trim_free_extents fails. 
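From user space the partial progress is now visible even when the ioctl fails. A minimal, self-contained sketch (illustration only, the mount point path is an arbitrary assumption):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(void)
	{
		struct fstrim_range range = { .start = 0, .len = (__u64)-1, .minlen = 0 };
		int fd = open("/mnt/btrfs", O_RDONLY);	/* example mount point */

		if (fd < 0)
			return 1;
		if (ioctl(fd, FITRIM, &range) < 0)
			perror("FITRIM");
		/* range.len now reflects the bytes trimmed so far, even on failure. */
		printf("trimmed: %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}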
CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Qu Wenruo Signed-off-by: Luca Stefani Reviewed-by: David Sterba Signed-off-by: David Sterba
---
fs/btrfs/extent-tree.c | 4 ++--
fs/btrfs/ioctl.c | 4 +---
2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index feec49e6f9c8..a5966324607d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6551,13 +6551,13 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); + + trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } - - trimmed += group_trimmed; } mutex_unlock(&fs_devices->device_list_mutex);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ee01cc828883..8537eb9b5531 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -543,13 +543,11 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); - if (ret < 0) - return ret; if (copy_to_user(arg, &range, sizeof(range))) return -EFAULT; - return 0; + return ret; } int __pure btrfs_is_empty_uuid(const u8 *uuid)
From 1b6e068a0cc3d3888ddd5e4967357075fd6502da Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 29 Aug 2024 19:03:52 +0100
Subject: [PATCH 102/110] btrfs: add and use helper to verify the calling task has locked the inode

We have a few places that check if we have the inode locked by doing:

ASSERT(inode_is_locked(vfs_inode));

This actually proved to be useful several times, as with assertions enabled (and by default they are in many distros) it immediately triggers a crash which is impossible for users to miss.

However that doesn't check if the lock is held by the calling task, so the check passes if some other task locked the inode.

Using one of the lockdep functions to check the lock is held, like lockdep_assert_held() for example, does check that the calling task holds the lock, and if that's not the case it produces a warning and stack trace in dmesg. However, despite the misleading "assert" in the name of the lockdep helpers, it does not trigger a crash/BUG_ON(), just a warning and splat in dmesg, which can easily go unnoticed by users who may have lockdep enabled.

So add a helper that does the ASSERT() and calls lockdep_assert_held() immediately after, and use it everywhere we check the inode is locked. Like this, if the lock is held by some other task we get the warning in dmesg, which is caught by fstests and very helpful during development, and may also be occasionally noticed by users with lockdep enabled.

Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba
---
fs/btrfs/btrfs_inode.h | 8 ++++++++
fs/btrfs/file.c | 2 +-
fs/btrfs/ordered-data.c | 2 +-
fs/btrfs/tree-log.c | 2 +-
fs/btrfs/verity.c | 6 +++---
fs/btrfs/xattr.c | 2 +-
6 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2d7f8da54d8a..90e72031c724 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -505,6 +505,14 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } +static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) +{ + /* Immediately trigger a crash if the inode is not locked. */ + ASSERT(inode_is_locked(&inode->vfs_inode)); + /* Trigger a splat in dmesg if this task is not holding the lock.
*/ + lockdep_assert_held(&inode->vfs_inode.i_rwsem); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c7a7234998aa..c5e36f58eb07 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1617,7 +1617,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { skip_ilock = true; current->journal_info = NULL; - lockdep_assert_held(&inode->vfs_inode.i_rwsem); + btrfs_assert_inode_locked(inode); } trace_btrfs_sync_file(file, datasync); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index eb9b32ffbc0c..2104d60c2161 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1015,7 +1015,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, { struct rb_node *n; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); spin_lock_irq(&inode->ordered_tree_lock); for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f0cf8ce26f01..e2ed2a791f8f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2877,7 +2877,7 @@ void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; - ASSERT(inode_is_locked(&ctx->inode->vfs_inode)); + btrfs_assert_inode_locked(ctx->inode); list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { list_del_init(&ordered->log_list); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index e36dc99021a0..e97ad824ae16 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -460,7 +460,7 @@ static int rollback_verity(struct btrfs_inode *inode) struct btrfs_root *root = inode->root; int ret; - ASSERT(inode_is_locked(&inode->vfs_inode)); + btrfs_assert_inode_locked(inode); truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); ret = btrfs_drop_verity_items(inode); @@ -585,7 +585,7 @@ static int btrfs_begin_enable_verity(struct file *filp) struct btrfs_trans_handle *trans; int ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; @@ -633,7 +633,7 @@ static int btrfs_end_enable_verity(struct file *filp, const void *desc, int ret = 0; int rollback_ret; - ASSERT(inode_is_locked(file_inode(filp))); + btrfs_assert_inode_locked(inode); if (desc == NULL) goto rollback; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 738c7bb8ea7c..ce464cd8e0ac 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -120,7 +120,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(inode_is_locked(inode)); + btrfs_assert_inode_locked(BTRFS_I(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) From 070969f17d82e4220f5800ea63139e513cdb17fd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Jun 2024 02:28:30 +0200 Subject: [PATCH 103/110] btrfs: rework BTRFS_I as macro to preserve parameter const Currently BTRFS_I is a static inline function that takes a const inode and returns btrfs inode, dropping the 'const' qualifier. 
This could break assumptions of the compiler, though it seems there's no real case where it does. To make the parameter and return type consistent regarding const, we can use container_of_const(), which preserves it. However, this would not check the parameter type. To fix that, use the same _Generic construct but implement only the two expected types.

Signed-off-by: David Sterba
---
fs/btrfs/btrfs_inode.h | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 90e72031c724..9a4b7c119318 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -350,10 +350,12 @@ static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, WRITE_ONCE(inode->first_dir_index_to_log, index); } -static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) -{ - return container_of(inode, struct btrfs_inode, vfs_inode); -} +/* Type checked and const-preserving VFS inode -> btrfs inode. */ +#define BTRFS_I(_inode) \ + _Generic(_inode, \ + struct inode *: container_of(_inode, struct btrfs_inode, vfs_inode), \ + const struct inode *: (const struct btrfs_inode *)container_of( \ + _inode, const struct btrfs_inode, vfs_inode)) static inline unsigned long btrfs_inode_hash(u64 objectid, const struct btrfs_root *root)
From ca283ea9920ac20ae23ed398b693db3121045019 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 26 Jun 2024 23:39:11 +0200
Subject: [PATCH 104/110] btrfs: constify more pointer parameters

Continue adding const to parameters. This is for clarity and a minor addition to safety. There are some minor effects in the assembly code and the .ko size, measured on a release config.

Signed-off-by: David Sterba
---
fs/btrfs/backref.c | 6 +++---
fs/btrfs/block-group.c | 34 +++++++++++++++++-----------------
fs/btrfs/block-group.h | 11 +++++------
fs/btrfs/block-rsv.c | 2 +-
fs/btrfs/block-rsv.h | 2 +-
fs/btrfs/ctree.c | 18 +++++++++---------
fs/btrfs/ctree.h | 6 +++---
fs/btrfs/discard.c | 4 ++--
fs/btrfs/file-item.c | 4 ++--
fs/btrfs/file-item.h | 2 +-
fs/btrfs/inode-item.c | 10 +++++-----
fs/btrfs/inode-item.h | 4 ++--
fs/btrfs/space-info.c | 25 ++++++++++++-------------
fs/btrfs/space-info.h | 10 +++++-----
fs/btrfs/tree-mod-log.c | 14 +++++++-------
fs/btrfs/tree-mod-log.h | 6 +++---
fs/btrfs/zoned.c | 2 +-
fs/btrfs/zoned.h | 4 ++--
include/trace/events/btrfs.h | 6 +++---
19 files changed, 84 insertions(+), 86 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a2de5c05f97c..e2f478ecd7fd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -219,8 +219,8 @@ static void free_pref(struct prelim_ref *ref) * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 * indicates a 'higher' block.
*/ -static int prelim_ref_compare(struct prelim_ref *ref1, - struct prelim_ref *ref2) +static int prelim_ref_compare(const struct prelim_ref *ref1, + const struct prelim_ref *ref2) { if (ref1->level < ref2->level) return -1; @@ -251,7 +251,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount, struct prelim_ref *newref) + int newcount, const struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 2e49d978f504..7980b2e33a92 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -23,7 +23,7 @@ #include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -40,9 +40,9 @@ int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) * * Should be called with balance_lock held */ -static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) +static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags) { - struct btrfs_balance_control *bctl = fs_info->balance_ctl; + const struct btrfs_balance_control *bctl = fs_info->balance_ctl; u64 target = 0; if (!bctl) @@ -1415,9 +1415,9 @@ out: } static bool clean_pinned_extents(struct btrfs_trans_handle *trans, - struct btrfs_block_group *bg) + const struct btrfs_block_group *bg) { - struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *prev_trans = NULL; const u64 start = bg->start; const u64 end = start + bg->length - 1; @@ -1756,14 +1756,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } -static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) { if (btrfs_is_zoned(fs_info)) return btrfs_zoned_should_reclaim(fs_info); return true; } -static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed) { const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info); u64 thresh_bytes = mult_perc(bg->length, thresh_pct); @@ -2006,8 +2006,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } -static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, - struct btrfs_path *path) +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, + const struct btrfs_path *path) { struct btrfs_chunk_map *map; struct btrfs_block_group_item bg; @@ -2055,7 +2055,7 @@ out_free_map: static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_key *key) + const struct btrfs_key *key) { struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; @@ -2640,8 +2640,8 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, } static int insert_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 chunk_offset, - u64 start, u64 num_bytes) + const struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; @@ 
-2817,7 +2817,7 @@ next: * For extent tree v2 we use the block_group_item->chunk_offset to point at our * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. */ -static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset) { u64 div = SZ_1G; u64 index; @@ -3842,8 +3842,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *sinfo, int force) +static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; @@ -4218,7 +4218,7 @@ out: return ret; } -static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) +static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; @@ -4622,7 +4622,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, return 0; } -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg) +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg) { if (btrfs_is_zoned(bg->fs_info)) return false; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 915111338fc0..36937eeab9b8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -266,7 +266,7 @@ struct btrfs_block_group { u64 reclaim_mark; }; -static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group) { return (block_group->start + block_group->length); } @@ -278,8 +278,7 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); } -static inline bool btrfs_is_block_group_data_only( - struct btrfs_block_group *block_group) +static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) { /* * In mixed mode the fragmentation is expected to be high, lowering the @@ -290,7 +289,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); +int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( @@ -370,7 +369,7 @@ static inline u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static inline int btrfs_block_group_done(struct btrfs_block_group *cache) +static inline int btrfs_block_group_done(const struct btrfs_block_group *cache) { smp_mb(); return cache->cached == BTRFS_CACHE_FINISHED || @@ -387,6 +386,6 @@ enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size); int btrfs_use_block_group_size_class(struct btrfs_block_group *bg, enum btrfs_block_group_size_class size_class, bool force_wrong_size_class); -bool btrfs_block_group_should_use_size_class(struct btrfs_block_group *bg); +bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index b299b82d676e..a07b9594dc70 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -553,7 +553,7 @@ try_reserve: return ERR_PTR(ret); } -int 
btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { u64 needed_bytes; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 1f53b967d069..d12b1fac5c74 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -89,7 +89,7 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, +int btrfs_check_trunc_cache_free_space(const struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 451203055bbf..0cc919d15b14 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2564,8 +2564,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * */ static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, int level) + const struct btrfs_path *path, + const struct btrfs_disk_key *key, int level) { int i; struct extent_buffer *t; @@ -2594,7 +2594,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, * that the new key won't break the order */ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2660,8 +2660,8 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, * is correct, we only need to bother the last key of @left and the first * key of @right. */ -static bool check_sibling_keys(struct extent_buffer *left, - struct extent_buffer *right) +static bool check_sibling_keys(const struct extent_buffer *left, + const struct extent_buffer *right) { struct btrfs_key left_last; struct btrfs_key right_first; @@ -2928,8 +2928,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. */ static int insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, + const struct btrfs_path *path, + const struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { struct extent_buffer *lower; @@ -4019,7 +4019,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * the front. */ void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end) + const struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4111,7 +4111,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, * make the item pointed to by the path bigger, data_size is the added size. 
*/ void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size) + const struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c8568b1a61c4..fd73c284822a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -538,7 +538,7 @@ int btrfs_previous_item(struct btrfs_root *root, int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_path *path, + const struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -572,9 +572,9 @@ bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 data_size); + const struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_path *path, u32 new_size, int from_end); + const struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 944a7340f6a4..e815d165cccc 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -68,7 +68,7 @@ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { }; static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) + const struct btrfs_block_group *block_group) { return &discard_ctl->discard_list[block_group->discard_index]; } @@ -80,7 +80,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, * * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. */ -static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +static bool btrfs_run_discard_work(const struct btrfs_discard_ctl *discard_ctl) { struct btrfs_fs_info *fs_info = container_of(discard_ctl, struct btrfs_fs_info, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 5c342fe1af61..886749b39672 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -151,7 +151,7 @@ static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) * Calculate the total size needed to allocate for an ordered sum structure * spanning @bytes in the file. 
*/ -static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) +static int btrfs_ordered_sum_size(const struct btrfs_fs_info *fs_info, unsigned long bytes) { return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } @@ -1272,7 +1272,7 @@ out: void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em) { struct btrfs_fs_info *fs_info = inode->root->fs_info; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 557dc43d7142..0e13661a71f3 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -74,7 +74,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, + const struct btrfs_file_extent_item *fi, struct extent_map *em); int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 316756ff08ac..29572dfaf878 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -14,7 +14,7 @@ #include "extent-tree.h" #include "file-item.h" -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name) { @@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, } struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; @@ -423,9 +423,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root return ret; } -static inline void btrfs_trace_truncate(struct btrfs_inode *inode, - struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, +static inline void btrfs_trace_truncate(const struct btrfs_inode *inode, + const struct extent_buffer *leaf, + const struct btrfs_file_extent_item *fi, u64 offset, int extent_type, int slot) { if (!inode) diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index c4aded82709b..c11b97fdccc4 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -109,11 +109,11 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, +struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf, int slot, const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( - struct extent_buffer *leaf, int slot, u64 ref_objectid, + const struct extent_buffer *leaf, int slot, u64 ref_objectid, const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c691784b4660..d5a9cd8a4fd8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -163,7 +163,7 @@ * thing with or without extra unallocated space. 
*/ -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included) { ASSERT(s_info); @@ -368,7 +368,7 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) } static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, + const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { u64 profile; @@ -437,7 +437,7 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, } int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -542,8 +542,8 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *info) { const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -844,9 +844,8 @@ static void flush_space(struct btrfs_fs_info *fs_info, return; } -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -871,7 +870,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + const struct btrfs_space_info *space_info) { const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; @@ -1943,7 +1942,7 @@ static u64 calc_unalloc_target(struct btrfs_fs_info *fs_info) * Typically with 10 block groups as the target, the discrete values this comes * out to are 0, 10, 20, ... , 80, 90, and 99. 
*/ -static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) +static int calc_dynamic_reclaim_threshold(const struct btrfs_space_info *space_info) { struct btrfs_fs_info *fs_info = space_info->fs_info; u64 unalloc = atomic64_read(&fs_info->free_chunk_space); @@ -1962,7 +1961,7 @@ static int calc_dynamic_reclaim_threshold(struct btrfs_space_info *space_info) return calc_pct_ratio(want, target); } -int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info) +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info) { lockdep_assert_held(&space_info->lock); @@ -1985,7 +1984,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static void do_reclaim_sweep(struct btrfs_fs_info *fs_info, +static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; @@ -2073,7 +2072,7 @@ bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) return ret; } -void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info) { int raid; struct btrfs_space_info *space_info; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 5602026c5e14..efbecc0c5258 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -217,7 +217,7 @@ struct reserve_ticket { wait_queue_head_t wait; }; -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); @@ -258,7 +258,7 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, @@ -271,7 +271,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, + const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( @@ -293,7 +293,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes); void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); -int btrfs_calc_reclaim_threshold(struct btrfs_space_info *space_info); -void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); +int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info); +void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index fa45b5fb9683..b382a4c443d4 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -170,7 +170,7 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, 
* this until all tree mod log insertions are recorded in the rb tree and then * write unlock fs_info::tree_mod_log_lock. */ -static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) +static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return true; @@ -188,7 +188,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffe /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) + const struct extent_buffer *eb) { if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) return false; @@ -198,7 +198,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, return true; } -static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, +static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { @@ -221,7 +221,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; @@ -258,7 +258,7 @@ out_unlock: return ret; } -static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, +static struct tree_mod_elem *tree_mod_log_alloc_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -278,7 +278,7 @@ static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, return tm; } -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) { @@ -535,7 +535,7 @@ static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, } int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items) diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index ff00c8e8a393..6308c577a4a4 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -37,7 +37,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); -int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, +int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, @@ -47,11 +47,11 @@ struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, int nr_items); -int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, +int btrfs_tree_mod_log_insert_move(const struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items); u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); diff --git 
a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 047e3337852e..71e184120a9b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2459,7 +2459,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); } -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 30b2e48a1cec..7612e6572605 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -89,7 +89,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); -bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); @@ -242,7 +242,7 @@ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } -static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) { return false; } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index e4add61e00f1..bf60ad50011e 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1825,7 +1825,7 @@ TRACE_EVENT(qgroup_update_counters, TRACE_EVENT(qgroup_update_reserve, - TP_PROTO(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, + TP_PROTO(const struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qgroup, s64 diff, int type), TP_ARGS(fs_info, qgroup, diff, type), @@ -1851,7 +1851,7 @@ TRACE_EVENT(qgroup_update_reserve, TRACE_EVENT(qgroup_meta_reserve, - TP_PROTO(struct btrfs_root *root, s64 diff, int type), + TP_PROTO(const struct btrfs_root *root, s64 diff, int type), TP_ARGS(root, diff, type), @@ -1874,7 +1874,7 @@ TRACE_EVENT(qgroup_meta_reserve, TRACE_EVENT(qgroup_meta_convert, - TP_PROTO(struct btrfs_root *root, s64 diff), + TP_PROTO(const struct btrfs_root *root, s64 diff), TP_ARGS(root, diff), From ab6eac7c9111b75fca243e2590a17b55e96e9d31 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 30 Aug 2024 16:48:20 +0930 Subject: [PATCH 105/110] btrfs: remove btrfs_folio_end_all_writers() The function btrfs_folio_end_all_writers() is only utilized in extent_writepage() as a way to unlock all subpage ranges (for both successful submission and error handling). Meanwhile we have a similar function, btrfs_folio_end_writer_lock(). The difference is that btrfs_folio_end_writer_lock() expects a range that is a subset of the already locked range. This limit on btrfs_folio_end_writer_lock() is a little overkill, preventing it from being utilized for error paths. So here we enhance btrfs_folio_end_writer_lock() to accept a superset of the locked range, and only end the locked subset. This means we can replace btrfs_folio_end_all_writers() with btrfs_folio_end_writer_lock(). 
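To make the relaxed semantics concrete, here is a minimal standalone sketch of the idea (illustrative only: hypothetical names, a plain unsigned long standing in for subpage::bitmaps, and no spinlock or atomics):

	#include <stdbool.h>
	#include <stdio.h>

	static unsigned long locked_bitmap;	/* one bit per locked sector */
	static int writers;			/* outstanding writer locks */

	/* Return true when the last writer lock on the folio is dropped. */
	static bool end_writer_range(int start_bit, int nbits)
	{
		int cleared = 0;

		/* Clear only the bits that are actually set inside the range. */
		for (int bit = start_bit; bit < start_bit + nbits; bit++) {
			if (locked_bitmap & (1UL << bit)) {
				locked_bitmap &= ~(1UL << bit);
				cleared++;
			}
		}
		writers -= cleared;
		return writers == 0;
	}

	int main(void)
	{
		locked_bitmap = 0x3c;	/* sectors 2-5 hold writer locks */
		writers = 4;
		/* Ending the superset [0, 8) releases exactly the 4 locked sectors. */
		printf("last writer gone: %d\n", end_writer_range(0, 8));
		return 0;
	}

Counting only the bits that were actually set is what makes passing a superset of the locked range safe. 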
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 ++- fs/btrfs/subpage.c | 57 +++++++------------------------------------- fs/btrfs/subpage.h | 1 - 3 files changed, 10 insertions(+), 51 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c07930986fe5..485d88f9947b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1466,7 +1466,8 @@ done: mapping_set_error(folio->mapping, ret); } - btrfs_folio_end_all_writers(inode_to_fs_info(inode), folio); + btrfs_folio_end_writer_lock(inode_to_fs_info(inode), folio, + page_start, PAGE_SIZE); ASSERT(ret <= 0); return ret; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index ca7d2aedfa8d..7fe58c4d9923 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -322,6 +322,8 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len); const int nbits = (len >> fs_info->sectorsize_bits); unsigned long flags; + unsigned int cleared = 0; + int bit = start_bit; bool last; btrfs_subpage_assert(fs_info, folio, start, len); @@ -339,11 +341,12 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf return true; } - ASSERT(atomic_read(&subpage->writers) >= nbits); - /* The target range should have been locked. */ - ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits)); - bitmap_clear(subpage->bitmaps, start_bit, nbits); - last = atomic_sub_and_test(nbits, &subpage->writers); + for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) { + clear_bit(bit, subpage->bitmaps); + cleared++; + } + ASSERT(atomic_read(&subpage->writers) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->writers); spin_unlock_irqrestore(&subpage->lock, flags); return last; } @@ -825,50 +828,6 @@ out: return found; } -/* - * Unlike btrfs_folio_end_writer_lock() which unlocks a specified subpage range, - * this ends all writer locked ranges of a page. - * - * This is for the locked page of extent_writepage(), as the locked page - * can contain several locked subpage ranges. - */ -void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio) -{ - struct btrfs_subpage *subpage = folio_get_private(folio); - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); - return; - } - - /* The page has no new delalloc range locked on it. Just plain unlock. 
*/ - if (atomic_read(&subpage->writers) == 0) { - folio_unlock(folio); - return; - } - while (cur < folio_start + PAGE_SIZE) { - u64 found_start; - u32 found_len; - bool found; - bool last; - - found = btrfs_subpage_find_writer_locked(fs_info, folio, cur, - &found_start, &found_len); - if (!found) - break; - last = btrfs_subpage_end_and_test_writer(fs_info, folio, - found_start, found_len); - if (last) { - folio_unlock(folio); - break; - } - cur = found_start + found_len; - } -} - #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ { \ const int sectors_per_page = fs_info->sectors_per_page; \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index b67cd5f6539d..f90e0c4f4cab 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -109,7 +109,6 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret); -void btrfs_folio_end_all_writers(const struct btrfs_fs_info *fs_info, struct folio *folio); /* * Template for subpage related operations. From 4c74a32ad323f89ac99b0f147e331f6ead100efa Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 3 Sep 2024 11:19:05 -0700 Subject: [PATCH 106/110] btrfs: DEFINE_FREE for struct btrfs_path Add a DEFINE_FREE for struct btrfs_path. This defines a function that can be called using the __free attribute. Define a macro BTRFS_PATH_AUTO_FREE to make the declaration of an auto-freeing path very clear. The intended use is to declare an automatically freed path in cases where the path is allocated somewhere at the beginning and freed either on all error paths or at the end of the function.

	int func() {
		BTRFS_PATH_AUTO_FREE(path);

		if (...)
			return -ERROR;

		path = alloc_path();
		...
		if (...)
			return -ERROR;
		...
		return 0;
	}
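For reference, the underlying mechanism is the compiler's cleanup attribute; DEFINE_FREE() and __free() from linux/cleanup.h expand roughly as follows (a simplified sketch, the real macros carry additional annotations):

	/* DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
	   generates approximately: */
	static inline void __free_btrfs_free_path(void *p)
	{
		btrfs_free_path(*(struct btrfs_path **)p);
	}

	/* ...and BTRFS_PATH_AUTO_FREE(path) then declares approximately: */
	struct btrfs_path *path
		__attribute__((__cleanup__(__free_btrfs_free_path))) = NULL;

The compiler runs the cleanup hook on every scope exit, and since btrfs_free_path() accepts a NULL pointer, returning before btrfs_alloc_path() has succeeded is safe. 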
Signed-off-by: Leo Martins [ update changelog ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fd73c284822a..1a44fb9845e3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,6 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H +#include "linux/cleanup.h" #include #include #include @@ -84,6 +85,9 @@ struct btrfs_path { unsigned int nowait:1; }; +#define BTRFS_PATH_AUTO_FREE(path_name) \ + struct btrfs_path *path_name __free(btrfs_free_path) = NULL + /* * The state of btrfs root */ @@ -598,6 +602,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); +DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); From 45763a0cbb91ba3a5db928c376c3b0bba3ce9b45 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 3 Sep 2024 11:19:06 -0700 Subject: [PATCH 107/110] btrfs: use btrfs_path auto free in zoned.c All cleanup paths lead to btrfs_free_path() so path can be defined with the automatic freeing callback in the following functions: - calculate_emulated_zone_size() - calculate_alloc_pointer() Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 71e184120a9b..7fa2920632ba 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -287,7 +287,7 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, /* The emulated zone size is determined from the size of device extent */ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct extent_buffer *leaf; @@ -304,28 +304,21 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; + return ret; /* No dev extents at all? 
Not good */ - if (ret > 0) { - ret = -EUCLEAN; - goto out; - } + if (ret > 0) + return -EUCLEAN; } leaf = path->nodes[0]; dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); - ret = 0; - -out: - btrfs_free_path(path); - - return ret; + return 0; } int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) @@ -1211,7 +1204,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; int ret; @@ -1246,7 +1239,7 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!ret) ret = -EUCLEAN; if (ret < 0) - goto out; + return ret; ret = btrfs_previous_extent_item(root, path, cache->start); if (ret) { if (ret == 1) { ret = 0; *offset_ret = 0; } - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -1266,15 +1259,10 @@ static int calculate_alloc_pointer(struct btrfs_block_group *cache, if (!(found_key.objectid >= cache->start && found_key.objectid + length <= cache->start + cache->length)) { - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } *offset_ret = found_key.objectid + length - cache->start; - ret = 0; - -out: - btrfs_free_path(path); - return ret; + return 0; } struct zone_info { From 68f32b9c98522b9689e82627abeb5c10b3501915 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 3 Sep 2024 11:19:07 -0700 Subject: [PATCH 108/110] btrfs: BTRFS_PATH_AUTO_FREE in orphan.c All cleanup paths lead to btrfs_free_path() so path can be defined with the automatic freeing callback in the following functions: - btrfs_insert_orphan_item() - btrfs_del_orphan_item() Signed-off-by: Leo Martins Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/orphan.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 6195a2215b8f..9f3ad124104f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -9,9 +9,8 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; - int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -21,16 +20,13 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - - btrfs_free_path(path); - return ret; + return btrfs_insert_empty_item(trans, root, path, &key, 0); } int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -44,15 +40,9 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - if (ret) { /* JDM: Really? 
*/ - ret = -ENOENT; - goto out; - } + return ret; + if (ret) + return -ENOENT; - ret = btrfs_del_item(trans, root, path); - -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } From 49a9907368a4633fe19c477159da7a3199c808ee Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 2 Sep 2024 13:57:08 +0930 Subject: [PATCH 109/110] btrfs: merge btrfs_folio_unlock_writer() into btrfs_folio_end_writer_lock() The function btrfs_folio_unlock_writer() is already calling btrfs_folio_end_writer_lock() to do the heavy lifting work; the only missing piece is the zero writers check. Thus there is no need to keep two different functions; move the zero writers check into btrfs_folio_end_writer_lock() and remove btrfs_folio_unlock_writer(). Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/subpage.c | 81 +++++++++++++++++------------------------- fs/btrfs/subpage.h | 2 -- 3 files changed, 35 insertions(+), 50 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 485d88f9947b..70be1150c34e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2220,7 +2220,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f cur, cur_len, !ret); mapping_set_error(mapping, ret); } - btrfs_folio_unlock_writer(fs_info, folio, cur, cur_len); + btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); if (ret < 0) found_error = true; next_page: diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 7fe58c4d9923..83660fa82c32 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -378,13 +378,47 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, return 0; } +/* + * Handle different locked folios: + * + * - Non-subpage folio + * Just unlock it. + * + * - folio locked but without any subpage range locked + * This happens either before writepage_delalloc() or the delalloc range is + * already handled by the previous folio. + * We can simply unlock it. + * + * - folio locked with subpage range locked. + * We go through the locked sectors inside the range and clear their locked + * bitmap, reduce the writer lock number, and unlock the page if that's + * the last locked range. + */ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { + struct btrfs_subpage *subpage = folio_get_private(folio); + + ASSERT(folio_test_locked(folio)); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { folio_unlock(folio); return; } + + /* + * For the subpage case, there are two types of locked folio: with or + * without a writers number. + * + * Since we own the page lock, no one else could touch subpage::writers + * and we are safe to do several atomic operations without spinlock. + */ + if (atomic_read(&subpage->writers) == 0) { + /* No writers, locked by plain lock_page(). */ + folio_unlock(folio); + return; + } + + btrfs_subpage_clamp_range(folio, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) folio_unlock(folio); @@ -702,53 +736,6 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } -/* - * Handle different locked pages with different page sizes: - * - * - Page locked by plain lock_page() - * It should not have any subpage::writers count. - * Can be unlocked by unlock_page(). - * This is the most common locked page for extent_writepage() called - * inside extent_write_cache_pages(). 
- * Rarer cases include the @locked_page from extent_write_locked_range(). - * - * - Page locked by lock_delalloc_pages() - * There is only one caller, all pages except @locked_page for - * extent_write_locked_range(). - * In this case, we have to call subpage helper to handle the case. - */ -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len) -{ - struct btrfs_subpage *subpage; - - ASSERT(folio_test_locked(folio)); - /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, folio->mapping)) { - folio_unlock(folio); - return; - } - - ASSERT(folio_test_private(folio) && folio_get_private(folio)); - subpage = folio_get_private(folio); - - /* - * For subpage case, there are two types of locked page. With or - * without writers number. - * - * Since we own the page lock, no one else could touch subpage::writers - * and we are safe to do several atomic operations without spinlock. - */ - if (atomic_read(&subpage->writers) == 0) { - /* No writers, locked by plain lock_page() */ - folio_unlock(folio); - return; - } - - /* Have writers, use proper subpage helper to end it */ - btrfs_folio_end_writer_lock(fs_info, folio, start, len); -} - /* * This is for folio already locked by plain lock_page()/folio_lock(), which * doesn't have any subpage awareness. diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index f90e0c4f4cab..f805261e0999 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -155,8 +155,6 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); -void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, - struct folio *folio, u64 start, u32 len); void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap); From bd610c0937aaf03b2835638ada1fab8b0524c61a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 2 Sep 2024 14:29:06 +0930 Subject: [PATCH 110/110] btrfs: only unlock the to-be-submitted ranges inside a folio [SUBPAGE COMPRESSION LIMITS] Currently inside writepage_delalloc(), if a delalloc range is going to be submitted asynchronously (inline or compression; the page dirty/writeback/unlock are all handled at a different time, not at submission time), then we return 1 and extent_writepage() will skip the submission. This is fine if every sector matches the page size, but if a sector is smaller than the page size (aka the subpage case), then it can be very problematic, for example for the following 64K page:

0     16K         32K        48K       64K
|/|   |///////|              |/|
  |                            |
  4K                           52K

Where |/| is the dirty range we need to submit. In the above case, we need the following different handling for the 3 ranges: - [0, 4K) needs to be submitted for regular write. A single sector cannot be compressed. - [16K, 32K) needs to be submitted for compressed write. - [48K, 52K) needs to be submitted for regular write. With the above, if we try to submit [16K, 32K) for compressed write, we will return 1 immediately, without submitting the remaining [48K, 52K) range. Furthermore, since extent_writepage() will exit without unlocking any sectors, the submitted range [0, 4K) will not have its sectors unlocked. That's the reason why, for now, subpage is only allowed for the full page range. [ENHANCEMENT] - Introduce a submission bitmap at btrfs_bio_ctrl::submit_bitmap This records which sectors will be submitted by extent_writepage_io(). This allows us to track which sectors need to be submitted and thus later be properly unlocked. For asynchronously submitted ranges (inline/compression), the corresponding bits will be cleared from that bitmap. - Only return 1 if no sector needs to be submitted in writepage_delalloc() - Only submit sectors marked by the submission bitmap inside extent_writepage_io() So we won't touch the asynchronously submitted part. - Introduce a btrfs_folio_end_writer_lock_bitmap() helper This will only unlock the involved sectors specified by the @bitmap parameter, to avoid touching the ranges asynchronously submitted. Please note that, since subpage compression is still limited to page-aligned ranges, this change is only a preparation for future sector-perfect compression support for subpage. 
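As an illustration of the bitmap arithmetic, here is a standalone sketch with assumed values (64K page, 4K sectors, so sectorsize_bits is 12 and one bit represents one sector):

	#include <stdio.h>

	int main(void)
	{
		const unsigned int sectorsize_bits = 12;	/* 4K sectors (assumed) */
		const unsigned long ranges[3][2] = {		/* dirty byte ranges from above */
			{ 0, 4096 }, { 16384, 32768 }, { 49152, 53248 } };
		unsigned long submit_bitmap = 0;

		for (int i = 0; i < 3; i++)
			for (unsigned long off = ranges[i][0]; off < ranges[i][1]; off += 4096)
				submit_bitmap |= 1UL << (off >> sectorsize_bits);
		printf("dirty:  0x%lx\n", submit_bitmap);	/* 0x10f1: bits 0, 4-7, 12 */

		/* [16K, 32K) goes to async compression, so clear its bits. */
		for (unsigned int bit = 16384 >> sectorsize_bits;
		     bit < (32768 >> sectorsize_bits); bit++)
			submit_bitmap &= ~(1UL << bit);
		printf("submit: 0x%lx\n", submit_bitmap);	/* 0x1001: [0,4K) and [48K,52K) */
		return 0;
	}

extent_writepage_io() then walks only the remaining set bits, and btrfs_folio_end_writer_lock_bitmap() unlocks only those sectors. 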
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 89 +++++++++++++++++++++++++------------------- fs/btrfs/subpage.c | 33 ++++++++++++++++ fs/btrfs/subpage.h | 2 + 3 files changed, 86 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 70be1150c34e..39c9677c47d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -101,6 +101,13 @@ struct btrfs_bio_ctrl { blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; + + /* + * The sectors of the page which are going to be submitted by + * extent_writepage_io(). + * This is to avoid touching ranges covered by compression/inline. + */ + unsigned long submit_bitmap; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -1106,9 +1113,10 @@ int btrfs_read_folio(struct file *file, struct folio *folio) */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct folio *folio, - struct writeback_control *wbc) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); + struct writeback_control *wbc = bio_ctrl->wbc; const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; @@ -1123,6 +1131,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 delalloc_to_write = 0; int ret = 0; + /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ + if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { + ASSERT(fs_info->sectors_per_page > 1); + btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); + } else { + bio_ctrl->submit_bitmap = 1; + } + /* Lock all (subpage) delalloc ranges inside the folio first. */ while (delalloc_start < page_end) { delalloc_end = page_end; @@ -1190,22 +1206,18 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } /* - * We can hit btrfs_run_delalloc_range() with >0 return value. - * - * This happens when either the IO is already done and folio - * unlocked (inline) or the IO submission and folio unlock would - * be handled as async (compression). - * - * Inline is only possible for regular sectorsize for now. - * - * Compression is possible for both subpage and regular cases, - * but even for subpage compression only happens for page aligned - * range, thus the found delalloc range must go beyond current - * folio. + * We have some ranges that are going to be submitted asynchronously + * (compression or inline). These ranges have their own control + * on when to unlock the pages. 
We should not touch them + * anymore, so clear the range from the submission bitmap. */ - if (ret > 0) - ASSERT(!is_subpage || found_start + found_len >= page_end); - + if (ret > 0) { + unsigned int start_bit = (found_start - page_start) >> + fs_info->sectorsize_bits; + unsigned int end_bit = (min(page_end + 1, found_start + found_len) - + page_start) >> fs_info->sectorsize_bits; + bitmap_clear(&bio_ctrl->submit_bitmap, start_bit, end_bit - start_bit); + } /* * Above btrfs_run_delalloc_range() may have unlocked the folio, * thus for the last range, we cannot touch the folio anymore. @@ -1230,10 +1242,10 @@ out: DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); /* - * If btrfs_run_dealloc_range() already started I/O and unlocked - * the folios, we just need to account for them here. + * If all ranges are submitted asynchronously, we just need to account + * for them here. */ - if (ret == 1) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1331,15 +1343,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long range_bitmap = 0; - /* - * This is the default value for sectorsize == PAGE_SIZE case. - * We known we need to write the dirty sector (aka the page), - * even if the page is not dirty (we cleared it before entering). - * - * For subpage cases we will get the correct bitmap later. - */ - unsigned long dirty_bitmap = 1; - unsigned int bitmap_size = 1; bool submitted_io = false; const u64 folio_start = folio_pos(folio); u64 cur; @@ -1357,18 +1360,14 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, return 1; } - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); - btrfs_get_subpage_dirty_bitmap(fs_info, folio, &dirty_bitmap); - bitmap_size = fs_info->sectors_per_page; - } for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); - bitmap_and(&dirty_bitmap, &dirty_bitmap, &range_bitmap, bitmap_size); + bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, + fs_info->sectors_per_page); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &dirty_bitmap, bitmap_size) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1421,6 +1420,7 @@ out: static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl) { struct inode *inode = folio->mapping->host; + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); const u64 page_start = folio_pos(folio); int ret; size_t pg_offset; @@ -1442,11 +1442,16 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl if (folio->index == end_index) folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); + /* + * Default to unlock the whole folio. + * The proper bitmap can only be initialized in writepage_delalloc(). 
+ */ + bio_ctrl->submit_bitmap = (unsigned long)-1; ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; - ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl->wbc); + ret = writepage_delalloc(BTRFS_I(inode), folio, bio_ctrl); if (ret == 1) return 0; if (ret) @@ -1466,8 +1471,11 @@ done: mapping_set_error(folio->mapping, ret); } - btrfs_folio_end_writer_lock(inode_to_fs_info(inode), folio, - page_start, PAGE_SIZE); + /* + * Only unlock ranges that are submitted, as there can be some async + * submitted ranges inside the folio. + */ + btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); ASSERT(ret <= 0); return ret; } @@ -2210,6 +2218,11 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); + /* + * Set the submission bitmap to submit all sectors. + * extent_writepage_io() will do the truncation correctly. + */ + bio_ctrl.submit_bitmap = (unsigned long)-1; ret = extent_writepage_io(BTRFS_I(inode), folio, cur, cur_len, &bio_ctrl, i_size); if (ret == 1) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 83660fa82c32..fe4d719d506b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -424,6 +424,39 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } +void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap) +{ + struct btrfs_subpage *subpage = folio_get_private(folio); + const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + unsigned long flags; + bool last = false; + int cleared = 0; + int bit; + + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } + + if (atomic_read(&subpage->writers) == 0) { + /* No writers, locked by plain lock_page(). */ + folio_unlock(folio); + return; + } + + spin_lock_irqsave(&subpage->lock, flags); + for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) + cleared++; + } + ASSERT(atomic_read(&subpage->writers) >= cleared); + last = atomic_sub_and_test(cleared, &subpage->writers); + spin_unlock_irqrestore(&subpage->lock, flags); + if (last) + folio_unlock(folio); +} + #define subpage_test_bitmap_all_set(fs_info, subpage, name) \ bitmap_test_range_all_set(subpage->bitmaps, \ fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index f805261e0999..4b85d91d0e18 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -106,6 +106,8 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, + struct folio *folio, unsigned long bitmap); bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 search_start, u64 *found_start_ret, u32 *found_len_ret);