From a3c3eecc7c876c7f2b09d9ee1acf5b8b85996cff Mon Sep 17 00:00:00 2001 From: Junchao Sun Date: Mon, 3 Jun 2024 21:15:24 +0800 Subject: [PATCH 01/93] ext4: adjust the layout of the ext4_inode_info structure to save memory Using pahole, we can see that there are some padding holes in the current ext4_inode_info structure. Adjusting the layout of ext4_inode_info can reduce these holes, resulting in the size of the structure decreasing from 2424 bytes to 2408 bytes. Signed-off-by: Junchao Sun Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240603131524.324224-1-sunjunchao2870@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 08acd152261e..3d57acae0925 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1058,6 +1058,7 @@ struct ext4_inode_info { /* Number of ongoing updates on this inode */ atomic_t i_fc_updates; + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; @@ -1106,6 +1107,10 @@ struct ext4_inode_info { /* mballoc */ atomic_t i_prealloc_active; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, this refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; struct rb_root i_prealloc_node; rwlock_t i_prealloc_lock; @@ -1122,10 +1127,6 @@ struct ext4_inode_info { /* ialloc */ ext4_group_t i_last_alloc_group; - /* allocation reservation info for delalloc */ - /* In case of bigalloc, this refer to clusters rather than blocks */ - unsigned int i_reserved_data_blocks; - /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; @@ -1149,7 +1150,6 @@ struct ext4_inode_info { */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; - atomic_t i_unwritten; /* Nr. of inflight conversions pending */ spinlock_t i_block_reservation_lock; From 985b67cd86392310d9e9326de941c22fc9340eec Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Wed, 5 Jun 2024 09:23:35 +0800 Subject: [PATCH 02/93] ext4: filesystems without casefold feature cannot be mounted with siphash When mounting the ext4 filesystem, if the default hash version is set to DX_HASH_SIPHASH but the casefold feature is not set, exit the mounting. Reported-by: syzbot+340581ba9dceb7e06fb3@syzkaller.appspotmail.com Signed-off-by: Lizhi Xu Link: https://patch.msgid.link/20240605012335.44086-1-lizhi.xu@windriver.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e72145c4ae5a..25cd0d662e31 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3582,6 +3582,13 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) "mounted without CONFIG_UNICODE"); return 0; } + if (EXT4_SB(sb)->s_es->s_def_hash_version == DX_HASH_SIPHASH && + !ext4_has_feature_casefold(sb)) { + ext4_msg(sb, KERN_ERR, + "Filesystem without casefold feature cannot be " + "mounted with siphash"); + return 0; + } if (readonly) return 1; From f67fbacd923f280c55b9e26e49650331446ef335 Mon Sep 17 00:00:00 2001 From: carrion bent Date: Thu, 6 Jun 2024 13:43:16 +0800 Subject: [PATCH 03/93] ext4: fix macro definition error of EXT4_DIRENT_HASH and EXT4_DIRENT_MINOR_HASH The macro parameter 'entry' of EXT4_DIRENT_HASH and EXT4_DIRENT_MINOR_HASH was not used, but rather the variable 'de' was directly used, which may be a local variable inside a function that calls the macros. Fortunately, all callers have passed in 'de' so far, so this bug didn't have an effect. Signed-off-by: carrion bent Link: https://patch.msgid.link/1717652596-58760-1-git-send-email-carrionbent@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3d57acae0925..5845e4aa091a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2338,9 +2338,9 @@ struct ext4_dir_entry_2 { ((struct ext4_dir_entry_hash *) \ (((void *)(entry)) + \ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) -#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash) #define EXT4_DIRENT_MINOR_HASH(entry) \ - le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) + le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash) static inline bool ext4_hash_in_dirent(const struct inode *inode) { From 6ceeb2d8fdb19a1b6bb9cc48302e682fb380043b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 6 Jun 2024 20:55:06 +0800 Subject: [PATCH 04/93] ext4: correct comment of ext4_xattr_cmp The ext4_xattr_cmp never returns negative error number. Correct possible return value in ext4_xattr_cmp's comment. Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240606125508.1459893-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 46ce2f21fef9..54928ba243ca 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -3065,8 +3065,7 @@ ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, * * Compare two extended attribute blocks for equality. * - * Returns 0 if the blocks are equal, 1 if they differ, and - * a negative error number on errors. + * Returns 0 if the blocks are equal, 1 if they differ. */ static int ext4_xattr_cmp(struct ext4_xattr_header *header1, From 4b14737ce90424179d615cd35f04453f398f8324 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 6 Jun 2024 20:55:07 +0800 Subject: [PATCH 05/93] ext4: correct comment of ext4_xattr_block_cache_insert There is no return value from ext4_xattr_block_cache_insert, just correct it's comment about return value. Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240606125508.1459893-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 54928ba243ca..31c5189e131f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -3036,8 +3036,6 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) * * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. - * - * Returns 0, or a negative error number on failure. */ static void ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, From 5071010ac3aa32a1b0f0b4c14d3ea6b217ba21ba Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 6 Jun 2024 20:55:08 +0800 Subject: [PATCH 06/93] ext4: correct comment of h_checksum Checksum of xattr block is always crc32c(uuid+blknum+xattrblock), see ext4_xattr_block_csum_set for detail. Remove incorrect comment that "id = inum if refcount=1". Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240606125508.1459893-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index bd97c4aa8177..7df0d77643c7 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -32,8 +32,7 @@ struct ext4_xattr_header { __le32 h_refcount; /* reference count */ __le32 h_blocks; /* number of disk blocks used */ __le32 h_hash; /* hash value of all attributes */ - __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ - /* id = inum if refcount=1, blknum otherwise */ + __le32 h_checksum; /* crc32c(uuid+blknum+xattrblock) */ __u32 h_reserved[3]; /* zero right now */ }; From 70dd7b573afeba9b8f8a33f2ae1e4a9a2ec8c1ec Mon Sep 17 00:00:00 2001 From: "yao.ly" Date: Mon, 1 Jul 2024 14:43:39 +0800 Subject: [PATCH 07/93] ext4: correct encrypted dentry name hash when not casefolded EXT4_DIRENT_HASH and EXT4_DIRENT_MINOR_HASH will access struct ext4_dir_entry_hash followed ext4_dir_entry. But there is no ext4_dir_entry_hash followed when inode is encrypted and not casefolded Signed-off-by: yao.ly Link: https://patch.msgid.link/1719816219-128287-1-git-send-email-yao.ly@linux.alibaba.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/dir.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ff4514e4626b..b8b6b06015cd 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -279,12 +279,20 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct fscrypt_str de_name = FSTR_INIT(de->name, de->name_len); + u32 hash; + u32 minor_hash; + + if (IS_CASEFOLDED(inode)) { + hash = EXT4_DIRENT_HASH(de); + minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + hash = 0; + minor_hash = 0; + } /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, - EXT4_DIRENT_HASH(de), - EXT4_DIRENT_MINOR_HASH(de), - &de_name, &fstr); + hash, minor_hash, &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) From 1a00a393d6a7fb1e745a41edd09019bd6a0ad64c Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Mon, 1 Jul 2024 22:25:03 +0800 Subject: [PATCH 08/93] ext4: no need to continue when the number of entries is 1 Fixes: ac27a0ec112a ("[PATCH] ext4: initial copy of files from ext3") Reported-by: syzbot+ae688d469e36fb5138d0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ae688d469e36fb5138d0 Signed-off-by: Edward Adam Davis Reported-and-tested-by: syzbot+ae688d469e36fb5138d0@syzkaller.appspotmail.com Link: https://patch.msgid.link/tencent_BE7AEE6C7C2D216CB8949CE8E6EE7ECC2C0A@qq.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6a95713f9193..1de1c1b460a7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1999,7 +1999,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, split = count/2; hash2 = map[split].hash; - continued = hash2 == map[split - 1].hash; + continued = split > 0 ? hash2 == map[split - 1].hash : 0; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", (unsigned long)dx_get_block(frame->at), hash2, split, count-split)); From bd8daa7717d94752ecd4a60b67a928d7159c2825 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 13 Jul 2024 17:10:24 +0200 Subject: [PATCH 09/93] ext4: use seq_putc() in two functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single characters (line breaks) should be put into a sequence. Thus use the corresponding function “seq_putc”. This issue was transformed by using the Coccinelle software. Signed-off-by: Markus Elfring Link: https://patch.msgid.link/076974ab-4da3-4176-89dc-0514e020c276@web.de Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 3 +-- fs/ext4/super.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9dda9cd68ab2..2008d2d524c9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3075,8 +3075,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) seq_puts(seq, " ]"); if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info)) seq_puts(seq, " Block bitmap corrupted!"); - seq_puts(seq, "\n"); - + seq_putc(seq, '\n'); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 25cd0d662e31..7806044ed816 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3045,7 +3045,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset) seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); - seq_puts(seq, "\n"); + seq_putc(seq, '\n'); return rc; } From 0ce160c5bdb67081a62293028dc85758a8efb22a Mon Sep 17 00:00:00 2001 From: Xiaxi Shen Date: Sun, 14 Jul 2024 21:33:36 -0700 Subject: [PATCH 10/93] ext4: fix timer use-after-free on failed mount Syzbot has found an ODEBUG bug in ext4_fill_super The del_timer_sync function cancels the s_err_report timer, which reminds about filesystem errors daily. We should guarantee the timer is no longer active before kfree(sbi). When filesystem mounting fails, the flow goes to failed_mount3, where an error occurs when ext4_stop_mmpd is called, causing a read I/O failure. This triggers the ext4_handle_error function that ultimately re-arms the timer, leaving the s_err_report timer active before kfree(sbi) is called. Fix the issue by canceling the s_err_report timer after calling ext4_stop_mmpd. Signed-off-by: Xiaxi Shen Reported-and-tested-by: syzbot+59e0101c430934bc9a36@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=59e0101c430934bc9a36 Link: https://patch.msgid.link/20240715043336.98097-1-shenxiaxi26@gmail.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7806044ed816..99ecd4ac7be0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5621,8 +5621,8 @@ failed_mount3a: failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); - del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); + del_timer_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: if (sbi->s_chksum_driver) From 6db3c1575a750fd417a70e0178bdf6efa0dd5037 Mon Sep 17 00:00:00 2001 From: "Luis Henriques (SUSE)" Date: Wed, 17 Jul 2024 18:22:20 +0100 Subject: [PATCH 11/93] ext4: fix fast commit inode enqueueing during a full journal commit When a full journal commit is on-going, any fast commit has to be enqueued into a different queue: FC_Q_STAGING instead of FC_Q_MAIN. This enqueueing is done only once, i.e. if an inode is already queued in a previous fast commit entry it won't be enqueued again. However, if a full commit starts _after_ the inode is enqueued into FC_Q_MAIN, the next fast commit needs to be done into FC_Q_STAGING. And this is not being done in function ext4_fc_track_template(). This patch fixes the issue by re-enqueuing an inode into the STAGING queue during the fast commit clean-up callback when doing a full commit. However, to prevent a race with a fast-commit, the clean-up callback has to be called with the journal locked. This bug was found using fstest generic/047. This test creates several 32k bytes files, sync'ing each of them after it's creation, and then shutting down the filesystem. Some data may be loss in this operation; for example a file may have it's size truncated to zero. Suggested-by: Jan Kara Signed-off-by: Luis Henriques (SUSE) Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240717172220.14201-1-luis.henriques@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fast_commit.c | 15 ++++++++++++++- fs/jbd2/journal.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 3926a05eceee..df71fd5b1fed 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1288,8 +1288,21 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (tid_geq(tid, iter->i_sync_tid)) + if (tid_geq(tid, iter->i_sync_tid)) { ext4_fc_reset_inode(&iter->vfs_inode); + } else if (full) { + /* + * We are called after a full commit, inode has been + * modified while the commit was running. Re-enqueue + * the inode into STAGING, which will then be splice + * back into MAIN. This cannot happen during + * fastcommit because the journal is locked all the + * time in that case (and tid doesn't increase so + * tid check above isn't reliable). + */ + list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, + &sbi->s_fc_q[FC_Q_STAGING]); + } /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1ebf2393bfb7..291a431f8aaf 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -740,9 +740,9 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit); */ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { - jbd2_journal_unlock_updates(journal); if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); + jbd2_journal_unlock_updates(journal); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) From 23dfdb56581ad92a9967bcd720c8c23356af74c1 Mon Sep 17 00:00:00 2001 From: "Luis Henriques (SUSE)" Date: Thu, 18 Jul 2024 10:43:56 +0100 Subject: [PATCH 12/93] ext4: fix access to uninitialised lock in fc replay path The following kernel trace can be triggered with fstest generic/629 when executed against a filesystem with fast-commit feature enabled: INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator. CPU: 0 PID: 866 Comm: mount Not tainted 6.10.0+ #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x66/0x90 register_lock_class+0x759/0x7d0 __lock_acquire+0x85/0x2630 ? __find_get_block+0xb4/0x380 lock_acquire+0xd1/0x2d0 ? __ext4_journal_get_write_access+0xd5/0x160 _raw_spin_lock+0x33/0x40 ? __ext4_journal_get_write_access+0xd5/0x160 __ext4_journal_get_write_access+0xd5/0x160 ext4_reserve_inode_write+0x61/0xb0 __ext4_mark_inode_dirty+0x79/0x270 ? ext4_ext_replay_set_iblocks+0x2f8/0x450 ext4_ext_replay_set_iblocks+0x330/0x450 ext4_fc_replay+0x14c8/0x1540 ? jread+0x88/0x2e0 ? rcu_is_watching+0x11/0x40 do_one_pass+0x447/0xd00 jbd2_journal_recover+0x139/0x1b0 jbd2_journal_load+0x96/0x390 ext4_load_and_init_journal+0x253/0xd40 ext4_fill_super+0x2cc6/0x3180 ... In the replay path there's an attempt to lock sbi->s_bdev_wb_lock in function ext4_check_bdev_write_error(). Unfortunately, at this point this spinlock has not been initialized yet. Moving it's initialization to an earlier point in __ext4_fill_super() fixes this splat. Signed-off-by: Luis Henriques (SUSE) Link: https://patch.msgid.link/20240718094356.7863-1-luis.henriques@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 99ecd4ac7be0..89d9b34d787a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5320,6 +5320,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + spin_lock_init(&sbi->s_bdev_wb_lock); + ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -5541,7 +5543,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ - spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; From f5cacdc6f2bb2a9bf214469dd7112b43dd2dd68a Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 18 Jul 2024 19:53:36 +0800 Subject: [PATCH 13/93] jbd2: stop waiting for space when jbd2_cleanup_journal_tail() returns error In __jbd2_log_wait_for_space(), we might call jbd2_cleanup_journal_tail() to recover some journal space. But if an error occurs while executing jbd2_cleanup_journal_tail() (e.g., an EIO), we don't stop waiting for free space right away, we try other branches, and if j_committing_transaction is NULL (i.e., the tid is 0), we will get the following complain: ============================================ JBD2: I/O error when updating journal superblock for sdd-8. __jbd2_log_wait_for_space: needed 256 blocks and only had 217 space available __jbd2_log_wait_for_space: no way to get more journal space in sdd-8 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 139804 at fs/jbd2/checkpoint.c:109 __jbd2_log_wait_for_space+0x251/0x2e0 Modules linked in: CPU: 2 PID: 139804 Comm: kworker/u8:3 Not tainted 6.6.0+ #1 RIP: 0010:__jbd2_log_wait_for_space+0x251/0x2e0 Call Trace: add_transaction_credits+0x5d1/0x5e0 start_this_handle+0x1ef/0x6a0 jbd2__journal_start+0x18b/0x340 ext4_dirty_inode+0x5d/0xb0 __mark_inode_dirty+0xe4/0x5d0 generic_update_time+0x60/0x70 [...] ============================================ So only if jbd2_cleanup_journal_tail() returns 1, i.e., there is nothing to clean up at the moment, continue to try to reclaim free space in other ways. Note that this fix relies on commit 6f6a6fda2945 ("jbd2: fix ocfs2 corrupt when updating journal superblock fails") to make jbd2_cleanup_journal_tail return the correct error code. Fixes: 8c3f25d8950c ("jbd2: don't give up looking for space so easily in __jbd2_log_wait_for_space") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240718115336.2554501-1-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 951f78634adf..7b593591273a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -86,8 +86,11 @@ __releases(&journal->j_state_lock) write_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); - } else if (jbd2_cleanup_journal_tail(journal) == 0) { - /* We were able to recover space; yay! */ + } else if (jbd2_cleanup_journal_tail(journal) <= 0) { + /* + * We were able to recover space or the + * journal was aborted due to an error. + */ ; } else if (tid) { /* From e37c9e173bff50a2d57dfecdd694457c00ce5a8c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 18 Jul 2024 23:29:59 +0100 Subject: [PATCH 14/93] ext4: reduce stack usage in ext4_mpage_readpages() This function is very similar to do_mpage_readpage() and a similar approach to that taken in commit 12ac5a65cb56 will work. As in do_mpage_readpage(), we only use this array for checking block contiguity and we can do that more efficiently with a little arithmetic. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20240718223005.568869-1-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/readpage.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 8494492582ab..5d3a9dc9a32d 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -221,7 +221,7 @@ int ext4_mpage_readpages(struct inode *inode, sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; - sector_t blocks[MAX_BUF_PER_PAGE]; + sector_t first_block; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; @@ -263,6 +263,7 @@ int ext4_mpage_readpages(struct inode *inode, unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; + first_block = map.m_pblk + map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ @@ -271,8 +272,6 @@ int ext4_mpage_readpages(struct inode *inode, } if (page_block == blocks_per_page) break; - blocks[page_block] = map.m_pblk + map_offset + - relative_block; page_block++; block_in_file++; } @@ -307,7 +306,9 @@ int ext4_mpage_readpages(struct inode *inode, goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ - if (page_block && blocks[page_block-1] != map.m_pblk-1) + if (!page_block) + first_block = map.m_pblk; + else if (first_block + page_block != map.m_pblk) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { @@ -316,7 +317,6 @@ int ext4_mpage_readpages(struct inode *inode, break; } else if (page_block == blocks_per_page) break; - blocks[page_block] = map.m_pblk+relative_block; page_block++; block_in_file++; } @@ -339,7 +339,7 @@ int ext4_mpage_readpages(struct inode *inode, * This folio will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != blocks[0] - 1 || + if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); @@ -355,7 +355,7 @@ int ext4_mpage_readpages(struct inode *inode, fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); - bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; @@ -371,7 +371,7 @@ int ext4_mpage_readpages(struct inode *inode, submit_bio(bio); bio = NULL; } else - last_block_in_bio = blocks[blocks_per_page - 1]; + last_block_in_bio = first_block + blocks_per_page - 1; continue; confused: if (bio) { From 368a83cebbb949adbcc20877c35367178497d9cc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 18 Jul 2024 23:30:00 +0100 Subject: [PATCH 15/93] ext4: pipeline buffer reads in mext_page_mkuptodate() Instead of synchronously reading one buffer at a time, submit reads as we walk the buffers in the first loop, then wait for them in the second loop. This should be significantly more efficient, particularly on HDDs, but I have not measured. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20240718223005.568869-2-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 204f53b23622..6d651ad788ac 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -174,7 +174,9 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) sector_t block; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, block_start, block_end; - int i, err, nr = 0, partial = 0; + int i, nr = 0; + bool partial = false; + BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -192,13 +194,13 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) - partial = 1; + partial = true; continue; } if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { - err = ext4_get_block(inode, block, bh, 0); + int err = ext4_get_block(inode, block, bh, 0); if (err) return err; if (!buffer_mapped(bh)) { @@ -207,6 +209,12 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) continue; } } + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + ext4_read_bh_nowait(bh, 0, NULL); BUG_ON(nr >= MAX_BUF_PER_PAGE); arr[nr++] = bh; } @@ -216,11 +224,10 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) for (i = 0; i < nr; i++) { bh = arr[i]; - if (!bh_uptodate_or_lock(bh)) { - err = ext4_read_bh(bh, 0, NULL); - if (err) - return err; - } + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + continue; + return -EIO; } out: if (!partial) From a40759fb16ae839f8c769174fde017564ea564ff Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 18 Jul 2024 23:30:01 +0100 Subject: [PATCH 16/93] ext4: remove array of buffer_heads from mext_page_mkuptodate() Iterate the folio's list of buffer_heads twice instead of keeping an array of pointers. This solves a too-large-array-for-stack problem on architectures with a ridiculoously large PAGE_SIZE and prepares ext4 to support larger folios. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20240718223005.568869-3-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 6d651ad788ac..660bf34a5c4b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -166,15 +166,14 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, return 0; } -/* Force page buffers uptodate w/o dropping page's lock */ -static int -mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) +/* Force folio buffers uptodate w/o dropping folio's lock */ +static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) { struct inode *inode = folio->mapping->host; sector_t block; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + struct buffer_head *bh, *head; unsigned int blocksize, block_start, block_end; - int i, nr = 0; + int nr = 0; bool partial = false; BUG_ON(!folio_test_locked(folio)); @@ -215,20 +214,23 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) continue; } ext4_read_bh_nowait(bh, 0, NULL); - BUG_ON(nr >= MAX_BUF_PER_PAGE); - arr[nr++] = bh; + nr++; } /* No io required */ if (!nr) goto out; - for (i = 0; i < nr; i++) { - bh = arr[i]; + bh = head; + do { + if (bh_offset(bh) + blocksize <= from) + continue; + if (bh_offset(bh) > to) + break; wait_on_buffer(bh); if (buffer_uptodate(bh)) continue; return -EIO; - } + } while ((bh = bh->b_this_page) != head); out: if (!partial) folio_mark_uptodate(folio); From 3e3a693551c3e9b45575e94ca2d1d670a47b3fcc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 18 Jul 2024 23:30:02 +0100 Subject: [PATCH 17/93] ext4: tidy the BH loop in mext_page_mkuptodate() This for loop is somewhat hard to read; turn it into a normal BH do-while loop. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20240718223005.568869-4-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 660bf34a5c4b..516897b0218e 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -187,9 +187,11 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) if (!head) head = create_empty_buffers(folio, blocksize, 0); - block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); - for (bh = head, block_start = 0; bh != head || !block_start; - block++, block_start = block_end, bh = bh->b_this_page) { + block = folio_pos(folio) >> inode->i_blkbits; + block_end = 0; + bh = head; + do { + block_start = block_end; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) @@ -215,7 +217,8 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) } ext4_read_bh_nowait(bh, 0, NULL); nr++; - } + } while (block++, (bh = bh->b_this_page) != head); + /* No io required */ if (!nr) goto out; From 7e8fb2eda9885ea2d13179a4c0bbf810f900ef25 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 22 Jul 2024 22:16:47 -0700 Subject: [PATCH 18/93] jbd2: fix kernel-doc for j_transaction_overhead_buffers Use the correct struct member name in the kernel-doc notation to prevent a kernel-doc build warning. include/linux/jbd2.h:1303: warning: Function parameter or struct member 'j_transaction_overhead_buffers' not described in 'journal_s' include/linux/jbd2.h:1303: warning: Excess struct member 'j_transaction_overhead' description in 'journal_s' Fixes: e3a00a23781c ("jbd2: precompute number of transaction descriptor blocks") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20240710182252.4c281445@canb.auug.org.au/ Signed-off-by: Randy Dunlap Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240723051647.3053491-1-rdunlap@infradead.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5157d92b6f23..17662eae408f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1086,7 +1086,7 @@ struct journal_s int j_revoke_records_per_block; /** - * @j_transaction_overhead: + * @j_transaction_overhead_buffers: * * Number of blocks each transaction needs for its own bookkeeping */ From dd589b0f1445e1ea1085b98edca6e4d5dedb98d0 Mon Sep 17 00:00:00 2001 From: "Luis Henriques (SUSE)" Date: Wed, 24 Jul 2024 17:11:15 +0100 Subject: [PATCH 19/93] ext4: fix incorrect tid assumption in ext4_wait_for_tail_page_commit() Function ext4_wait_for_tail_page_commit() assumes that '0' is not a valid value for transaction IDs, which is incorrect. Don't assume that and invoke jbd2_log_wait_commit() if the journal had a committing transaction instead. Signed-off-by: Luis Henriques (SUSE) Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240724161119.13448-2-luis.henriques@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 941c1c0d5c6e..a0fa5192db8e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5279,8 +5279,9 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = 0; + tid_t commit_tid; int ret; + bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* @@ -5305,12 +5306,14 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) folio_put(folio); if (ret != -EBUSY) return; - commit_tid = 0; + has_transaction = false; read_lock(&journal->j_state_lock); - if (journal->j_committing_transaction) + if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; + has_transaction = true; + } read_unlock(&journal->j_state_lock); - if (commit_tid) + if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } From 972090651ee15e51abfb2160e986fa050cfc7a40 Mon Sep 17 00:00:00 2001 From: "Luis Henriques (SUSE)" Date: Wed, 24 Jul 2024 17:11:16 +0100 Subject: [PATCH 20/93] ext4: fix incorrect tid assumption in __jbd2_log_wait_for_space() Function __jbd2_log_wait_for_space() assumes that '0' is not a valid value for transaction IDs, which is incorrect. Don't assume that and invoke jbd2_log_wait_commit() if the journal had a committing transaction instead. Signed-off-by: Luis Henriques (SUSE) Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240724161119.13448-3-luis.henriques@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/checkpoint.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 7b593591273a..bd34083d5fa2 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -79,9 +79,12 @@ __releases(&journal->j_state_lock) if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; tid_t tid = 0; + bool has_transaction = false; - if (journal->j_committing_transaction) + if (journal->j_committing_transaction) { tid = journal->j_committing_transaction->t_tid; + has_transaction = true; + } spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); if (chkpt) { @@ -92,7 +95,7 @@ __releases(&journal->j_state_lock) * journal was aborted due to an error. */ ; - } else if (tid) { + } else if (has_transaction) { /* * jbd2_journal_commit_transaction() may want * to take the checkpoint_mutex if JBD2_FLUSHED From 7a6443e1dad70281f99f0bd394d7fd342481a632 Mon Sep 17 00:00:00 2001 From: "Luis Henriques (SUSE)" Date: Wed, 24 Jul 2024 17:11:17 +0100 Subject: [PATCH 21/93] ext4: fix incorrect tid assumption in jbd2_journal_shrink_checkpoint_list() Function jbd2_journal_shrink_checkpoint_list() assumes that '0' is not a valid value for transaction IDs, which is incorrect. Don't assume that and use two extra boolean variables to control the loop iterations and keep track of the first and last tid. Signed-off-by: Luis Henriques (SUSE) Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240724161119.13448-4-luis.henriques@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/checkpoint.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index bd34083d5fa2..b3971e91e8eb 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -413,6 +413,7 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, tid_t tid = 0; unsigned long nr_freed = 0; unsigned long freed; + bool first_set = false; again: spin_lock(&journal->j_list_lock); @@ -432,8 +433,10 @@ again: else transaction = journal->j_checkpoint_transactions; - if (!first_tid) + if (!first_set) { first_tid = transaction->t_tid; + first_set = true; + } last_transaction = journal->j_checkpoint_transactions->t_cpprev; next_transaction = transaction; last_tid = last_transaction->t_tid; @@ -463,7 +466,7 @@ again: spin_unlock(&journal->j_list_lock); cond_resched(); - if (*nr_to_scan && next_tid) + if (*nr_to_scan && journal->j_shrink_transaction) goto again; out: trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, From ebc4b2c1ac92fc0f8bf3f5a9c285a871d5084a6b Mon Sep 17 00:00:00 2001 From: "Luis Henriques (SUSE)" Date: Wed, 24 Jul 2024 17:11:18 +0100 Subject: [PATCH 22/93] ext4: fix incorrect tid assumption in ext4_fc_mark_ineligible() Function jbd2_journal_shrink_checkpoint_list() assumes that '0' is not a valid value for transaction IDs, which is incorrect. Furthermore, the sbi->s_fc_ineligible_tid handling also makes the same assumption by being initialised to '0'. Fortunately, the sb flag EXT4_MF_FC_INELIGIBLE can be used to check whether sbi->s_fc_ineligible_tid has been previously set instead of comparing it with '0'. Signed-off-by: Luis Henriques (SUSE) Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240724161119.13448-5-luis.henriques@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fast_commit.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index df71fd5b1fed..326c16a4e51e 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -339,22 +339,29 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl { struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; + bool has_transaction = true; + bool is_ineligible; if (ext4_fc_disabled(sb)) return; - ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (handle && !IS_ERR(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); - tid = sbi->s_journal->j_running_transaction ? - sbi->s_journal->j_running_transaction->t_tid : 0; + if (sbi->s_journal->j_running_transaction) + tid = sbi->s_journal->j_running_transaction->t_tid; + else + has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } spin_lock(&sbi->s_fc_lock); - if (tid_gt(tid, sbi->s_fc_ineligible_tid)) + is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (has_transaction && + (!is_ineligible || + (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid)))) sbi->s_fc_ineligible_tid = tid; + ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; From 736c24e62e7e1f5d00fb5b750bb7c13c56410d07 Mon Sep 17 00:00:00 2001 From: Stefan Tauner Date: Sun, 28 Jul 2024 02:34:33 +0200 Subject: [PATCH 23/93] Documentation: ext4.rst: remove obsolete descriptions of noacl/nouser_xattr options These have been deprecated for a decade[1] and removed two years ago[2]. 1: f70486055ee351158bd6999f3965ad378b52c694 2: 2d544ec923dbe5fbed64a7f43dccf527218380bc Signed-off-by: Stefan Tauner Link: https://patch.msgid.link/20240728003433.2566649-1-stefan.tauner@gmx.at Signed-off-by: Theodore Ts'o --- Documentation/admin-guide/ext4.rst | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst index 5740d85439ff..2418b0c2d3df 100644 --- a/Documentation/admin-guide/ext4.rst +++ b/Documentation/admin-guide/ext4.rst @@ -212,16 +212,6 @@ When mounting an ext4 filesystem, the following option are accepted: that ext4's inode table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. - nouser_xattr - Disables Extended User Attributes. See the attr(5) manual page for - more information about extended attributes. - - noacl - This option disables POSIX Access Control List support. If ACL support - is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL - is enabled by default on mount. See the acl(5) manual page for more - information about acl. - bsddf (*) Make 'df' act like BSD. From 01cdf03b1378f2d860d4eb5951895a92002226a3 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 31 Jul 2024 00:02:02 +0200 Subject: [PATCH 24/93] ext4: annotate struct ext4_xattr_inode_array with __counted_by() Add the __counted_by compiler attribute to the flexible array member inodes to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Remove the now obsolete comment on the count field. In ext4_expand_inode_array(), use struct_size() instead of offsetof() and remove the local variable count. Increment the count field before adding a new inode to the inodes array. Compile-tested only. Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20240730220200.410939-3-thorsten.blum@toblux.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 22 ++++++++++------------ fs/ext4/xattr.h | 4 ++-- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 31c5189e131f..c14a674c3e3d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2879,33 +2879,31 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, if (*ea_inode_array == NULL) { /* * Start with 15 inodes, so it fits into a power-of-two size. - * If *ea_inode_array is NULL, this is essentially offsetof() */ - (*ea_inode_array) = - kmalloc(offsetof(struct ext4_xattr_inode_array, - inodes[EIA_MASK]), - GFP_NOFS); + (*ea_inode_array) = kmalloc( + struct_size(*ea_inode_array, inodes, EIA_MASK), + GFP_NOFS); if (*ea_inode_array == NULL) return -ENOMEM; (*ea_inode_array)->count = 0; } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { /* expand the array once all 15 + n * 16 slots are full */ struct ext4_xattr_inode_array *new_array = NULL; - int count = (*ea_inode_array)->count; - /* if new_array is NULL, this is essentially offsetof() */ new_array = kmalloc( - offsetof(struct ext4_xattr_inode_array, - inodes[count + EIA_INCR]), - GFP_NOFS); + struct_size(*ea_inode_array, inodes, + (*ea_inode_array)->count + EIA_INCR), + GFP_NOFS); if (new_array == NULL) return -ENOMEM; memcpy(new_array, *ea_inode_array, - offsetof(struct ext4_xattr_inode_array, inodes[count])); + struct_size(*ea_inode_array, inodes, + (*ea_inode_array)->count)); kfree(*ea_inode_array); *ea_inode_array = new_array; } - (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; + (*ea_inode_array)->count++; + (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode; return 0; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 7df0d77643c7..b25c2d7b5f99 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -129,8 +129,8 @@ struct ext4_xattr_ibody_find { }; struct ext4_xattr_inode_array { - unsigned int count; /* # of used items in the array */ - struct inode *inodes[]; + unsigned int count; + struct inode *inodes[] __counted_by(count); }; extern const struct xattr_handler ext4_xattr_user_handler; From f0e3c14802515f60a47e6ef347ea59c2733402aa Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:08 +0800 Subject: [PATCH 25/93] jbd2: correctly compare tids with tid_geq function in jbd2_fc_begin_commit Use tid_geq to compare tids to work over sequence number wraps. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Cc: stable@kernel.org Link: https://patch.msgid.link/20240801013815.2393869-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 291a431f8aaf..33a975193f1f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -710,7 +710,7 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) return -EINVAL; write_lock(&journal->j_state_lock); - if (tid <= journal->j_commit_sequence) { + if (tid_geq(journal->j_commit_sequence, tid)) { write_unlock(&journal->j_state_lock); return -EALREADY; } From f2917bda8a5c97be41c34e7761863a724097cf91 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:09 +0800 Subject: [PATCH 26/93] jbd2: remove dead check in journal_alloc_journal_head We will alloc journal_head with __GFP_NOFAIL anyway, test for failure is pointless. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240801013815.2393869-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 33a975193f1f..326373ab022a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2866,8 +2866,7 @@ static struct journal_head *journal_alloc_journal_head(void) ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); } - if (ret) - spin_lock_init(&ret->b_state_lock); + spin_lock_init(&ret->b_state_lock); return ret; } From fa10db138d205362026daecc8ea65d4e339ade00 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:10 +0800 Subject: [PATCH 27/93] jbd2: remove unused return value of jbd2_fc_release_bufs Remove unused return value of jbd2_fc_release_bufs. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240801013815.2393869-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 4 +--- include/linux/jbd2.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 326373ab022a..51f184ba830c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -903,7 +903,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) } EXPORT_SYMBOL(jbd2_fc_wait_bufs); -int jbd2_fc_release_bufs(journal_t *journal) +void jbd2_fc_release_bufs(journal_t *journal) { struct buffer_head *bh; int i, j_fc_off; @@ -917,8 +917,6 @@ int jbd2_fc_release_bufs(journal_t *journal) put_bh(bh); journal->j_fc_wbuf[i] = NULL; } - - return 0; } EXPORT_SYMBOL(jbd2_fc_release_bufs); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 17662eae408f..8aef9bb6ad57 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1675,7 +1675,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); -int jbd2_fc_release_bufs(journal_t *journal); +void jbd2_fc_release_bufs(journal_t *journal); /* * is_journal_abort From debbfd991f0116a8e88b77f08d231c57a1207dec Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:11 +0800 Subject: [PATCH 28/93] jbd2: remove unneeded kmap for jh_in->b_frozen_data in jbd2_journal_write_metadata_buffer Remove kmap for page of b_frozen_data from jbd2_alloc() which always provides an address from the direct kernel mapping. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240801013815.2393869-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 51f184ba830c..f7464ac29cd5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -352,12 +352,13 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, done_copy_out = 1; new_folio = virt_to_folio(jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); + mapped_data = jh_in->b_frozen_data; } else { new_folio = bh_in->b_folio; new_offset = offset_in_folio(new_folio, bh_in->b_data); + mapped_data = kmap_local_folio(new_folio, new_offset); } - mapped_data = kmap_local_folio(new_folio, new_offset); /* * Fire data frozen trigger if data already wasn't frozen. Do this * before checking for escaping, as the trigger may modify the magic @@ -373,7 +374,8 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) do_escape = 1; - kunmap_local(mapped_data); + if (!jh_in->b_frozen_data) + kunmap_local(mapped_data); /* * Do we need to do a data copy? From 7c48e7d5a195fc918f9ab281390a5ba4b3e18022 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:12 +0800 Subject: [PATCH 29/93] jbd2: remove unneeded done_copy_out variable in jbd2_journal_write_metadata_buffer It's more intuitive to use jh_in->b_frozen_data directly instead of done_copy_out variable. Simply remove unneeded done_copy_out variable and use b_frozen_data instead. Signed-off-by: Kemeng Shi Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240801013815.2393869-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f7464ac29cd5..225a9a123335 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -318,7 +318,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct buffer_head **bh_out, sector_t blocknr) { - int done_copy_out = 0; int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; @@ -349,7 +348,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, * we use that version of the data for the commit. */ if (jh_in->b_frozen_data) { - done_copy_out = 1; new_folio = virt_to_folio(jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); mapped_data = jh_in->b_frozen_data; @@ -357,17 +355,15 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, new_folio = bh_in->b_folio; new_offset = offset_in_folio(new_folio, bh_in->b_data); mapped_data = kmap_local_folio(new_folio, new_offset); - } - - /* - * Fire data frozen trigger if data already wasn't frozen. Do this - * before checking for escaping, as the trigger may modify the magic - * offset. If a copy-out happens afterwards, it will have the correct - * data in the buffer. - */ - if (!done_copy_out) + /* + * Fire data frozen trigger if data already wasn't frozen. Do + * this before checking for escaping, as the trigger may modify + * the magic offset. If a copy-out happens afterwards, it will + * have the correct data in the buffer. + */ jbd2_buffer_frozen_trigger(jh_in, mapped_data, jh_in->b_triggers); + } /* * Check for escaping @@ -380,7 +376,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, /* * Do we need to do a data copy? */ - if (do_escape && !done_copy_out) { + if (do_escape && !jh_in->b_frozen_data) { char *tmp; spin_unlock(&jh_in->b_state_lock); @@ -408,7 +404,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, copy_done: new_folio = virt_to_folio(jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); - done_copy_out = 1; } /* From f47aa3ebe3f4ff5ca4b9abfdbb71d7b775bb0c44 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:13 +0800 Subject: [PATCH 30/93] jbd2: move escape handle to futher improve jbd2_journal_write_metadata_buffer Move escape handle to futher improve code readability and remove some repeat check. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240801013815.2393869-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 49 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 225a9a123335..2e61f001fc03 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -281,6 +281,16 @@ static void journal_kill_thread(journal_t *journal) write_unlock(&journal->j_state_lock); } +static inline bool jbd2_data_needs_escaping(char *data) +{ + return *((__be32 *)data) == cpu_to_be32(JBD2_MAGIC_NUMBER); +} + +static inline void jbd2_data_do_escape(char *data) +{ + *((unsigned int *)data) = 0; +} + /* * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. * @@ -319,7 +329,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, sector_t blocknr) { int do_escape = 0; - char *mapped_data; struct buffer_head *new_bh; struct folio *new_folio; unsigned int new_offset; @@ -350,8 +359,13 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, if (jh_in->b_frozen_data) { new_folio = virt_to_folio(jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); - mapped_data = jh_in->b_frozen_data; + do_escape = jbd2_data_needs_escaping(jh_in->b_frozen_data); + if (do_escape) + jbd2_data_do_escape(jh_in->b_frozen_data); } else { + char *tmp; + char *mapped_data; + new_folio = bh_in->b_folio; new_offset = offset_in_folio(new_folio, bh_in->b_data); mapped_data = kmap_local_folio(new_folio, new_offset); @@ -363,21 +377,13 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ jbd2_buffer_frozen_trigger(jh_in, mapped_data, jh_in->b_triggers); - } - - /* - * Check for escaping - */ - if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) - do_escape = 1; - if (!jh_in->b_frozen_data) + do_escape = jbd2_data_needs_escaping(mapped_data); kunmap_local(mapped_data); - - /* - * Do we need to do a data copy? - */ - if (do_escape && !jh_in->b_frozen_data) { - char *tmp; + /* + * Do we need to do a data copy? + */ + if (!do_escape) + goto escape_done; spin_unlock(&jh_in->b_state_lock); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); @@ -404,17 +410,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, copy_done: new_folio = virt_to_folio(jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); + jbd2_data_do_escape(jh_in->b_frozen_data); } - /* - * Did we need to do an escaping? Now we've done all the - * copying, we can finally do so. - * b_frozen_data is from jbd2_alloc() which always provides an - * address from the direct kernels mapping. - */ - if (do_escape) - *((unsigned int *)jh_in->b_frozen_data) = 0; - +escape_done: folio_set_bh(new_bh, new_folio, new_offset); new_bh->b_size = bh_in->b_size; new_bh->b_bdev = journal->j_dev; From 1862304b062acb15e05b4e51270dc92de4b7635b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:14 +0800 Subject: [PATCH 31/93] jbd2: correct comment jbd2_mark_journal_empty After jbd2_mark_journal_empty, journal log is supposed to be empty. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240801013815.2393869-8-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e61f001fc03..7b78c0afa09f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1938,7 +1938,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) if (had_fast_commit) jbd2_set_feature_fast_commit(journal); - /* Log is no longer empty */ + /* Log is empty */ write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_FLUSHED; write_unlock(&journal->j_state_lock); From 6140ceb9b224fd178f405a7805d3fd82d2d02c39 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:15 +0800 Subject: [PATCH 32/93] jbd2: remove unneeded check of ret in jbd2_fc_get_buf Simply return -EINVAL if j_fc_off is invalid to avoid repeated check of ret. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240801013815.2393869-9-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 7b78c0afa09f..97f487c3d8fc 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -837,17 +837,12 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) *bh_out = NULL; - if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { - fc_off = journal->j_fc_off; - blocknr = journal->j_fc_first + fc_off; - journal->j_fc_off++; - } else { - ret = -EINVAL; - } - - if (ret) - return ret; + if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last) + return -EINVAL; + fc_off = journal->j_fc_off; + blocknr = journal->j_fc_first + fc_off; + journal->j_fc_off++; ret = jbd2_journal_bmap(journal, blocknr, &pblock); if (ret) return ret; @@ -856,7 +851,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out) if (!bh) return -ENOMEM; - journal->j_fc_wbuf[fc_off] = bh; *bh_out = bh; From d1bc560e9a9c78d0b2314692847fc8661e0aeb99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wojciech=20G=C5=82adysz?= Date: Thu, 1 Aug 2024 16:38:27 +0200 Subject: [PATCH 33/93] ext4: nested locking for xattr inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add nested locking with I_MUTEX_XATTR subclass to avoid lockdep warning while handling xattr inode on file open syscall at ext4_xattr_inode_iget. Backtrace EXT4-fs (loop0): Ignoring removed oldalloc option ====================================================== WARNING: possible circular locking dependency detected 5.10.0-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor543/2794 is trying to acquire lock: ffff8880215e1a48 (&ea_inode->i_rwsem#7/1){+.+.}-{3:3}, at: inode_lock include/linux/fs.h:782 [inline] ffff8880215e1a48 (&ea_inode->i_rwsem#7/1){+.+.}-{3:3}, at: ext4_xattr_inode_iget+0x42a/0x5c0 fs/ext4/xattr.c:425 but task is already holding lock: ffff8880215e3278 (&ei->i_data_sem/3){++++}-{3:3}, at: ext4_setattr+0x136d/0x19c0 fs/ext4/inode.c:5559 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&ei->i_data_sem/3){++++}-{3:3}: lock_acquire+0x197/0x480 kernel/locking/lockdep.c:5566 down_write+0x93/0x180 kernel/locking/rwsem.c:1564 ext4_update_i_disksize fs/ext4/ext4.h:3267 [inline] ext4_xattr_inode_write fs/ext4/xattr.c:1390 [inline] ext4_xattr_inode_lookup_create fs/ext4/xattr.c:1538 [inline] ext4_xattr_set_entry+0x331a/0x3d80 fs/ext4/xattr.c:1662 ext4_xattr_ibody_set+0x124/0x390 fs/ext4/xattr.c:2228 ext4_xattr_set_handle+0xc27/0x14e0 fs/ext4/xattr.c:2385 ext4_xattr_set+0x219/0x390 fs/ext4/xattr.c:2498 ext4_xattr_user_set+0xc9/0xf0 fs/ext4/xattr_user.c:40 __vfs_setxattr+0x404/0x450 fs/xattr.c:177 __vfs_setxattr_noperm+0x11d/0x4f0 fs/xattr.c:208 __vfs_setxattr_locked+0x1f9/0x210 fs/xattr.c:266 vfs_setxattr+0x112/0x2c0 fs/xattr.c:283 setxattr+0x1db/0x3e0 fs/xattr.c:548 path_setxattr+0x15a/0x240 fs/xattr.c:567 __do_sys_setxattr fs/xattr.c:582 [inline] __se_sys_setxattr fs/xattr.c:578 [inline] __x64_sys_setxattr+0xc5/0xe0 fs/xattr.c:578 do_syscall_64+0x6d/0xa0 arch/x86/entry/common.c:62 entry_SYSCALL_64_after_hwframe+0x61/0xcb -> #0 (&ea_inode->i_rwsem#7/1){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:2988 [inline] check_prevs_add kernel/locking/lockdep.c:3113 [inline] validate_chain+0x1695/0x58f0 kernel/locking/lockdep.c:3729 __lock_acquire+0x12fd/0x20d0 kernel/locking/lockdep.c:4955 lock_acquire+0x197/0x480 kernel/locking/lockdep.c:5566 down_write+0x93/0x180 kernel/locking/rwsem.c:1564 inode_lock include/linux/fs.h:782 [inline] ext4_xattr_inode_iget+0x42a/0x5c0 fs/ext4/xattr.c:425 ext4_xattr_inode_get+0x138/0x410 fs/ext4/xattr.c:485 ext4_xattr_move_to_block fs/ext4/xattr.c:2580 [inline] ext4_xattr_make_inode_space fs/ext4/xattr.c:2682 [inline] ext4_expand_extra_isize_ea+0xe70/0x1bb0 fs/ext4/xattr.c:2774 __ext4_expand_extra_isize+0x304/0x3f0 fs/ext4/inode.c:5898 ext4_try_to_expand_extra_isize fs/ext4/inode.c:5941 [inline] __ext4_mark_inode_dirty+0x591/0x810 fs/ext4/inode.c:6018 ext4_setattr+0x1400/0x19c0 fs/ext4/inode.c:5562 notify_change+0xbb6/0xe60 fs/attr.c:435 do_truncate+0x1de/0x2c0 fs/open.c:64 handle_truncate fs/namei.c:2970 [inline] do_open fs/namei.c:3311 [inline] path_openat+0x29f3/0x3290 fs/namei.c:3425 do_filp_open+0x20b/0x450 fs/namei.c:3452 do_sys_openat2+0x124/0x460 fs/open.c:1207 do_sys_open fs/open.c:1223 [inline] __do_sys_open fs/open.c:1231 [inline] __se_sys_open fs/open.c:1227 [inline] __x64_sys_open+0x221/0x270 fs/open.c:1227 do_syscall_64+0x6d/0xa0 arch/x86/entry/common.c:62 entry_SYSCALL_64_after_hwframe+0x61/0xcb other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&ei->i_data_sem/3); lock(&ea_inode->i_rwsem#7/1); lock(&ei->i_data_sem/3); lock(&ea_inode->i_rwsem#7/1); *** DEADLOCK *** 5 locks held by syz-executor543/2794: #0: ffff888026fbc448 (sb_writers#4){.+.+}-{0:0}, at: mnt_want_write+0x4a/0x2a0 fs/namespace.c:365 #1: ffff8880215e3488 (&sb->s_type->i_mutex_key#7){++++}-{3:3}, at: inode_lock include/linux/fs.h:782 [inline] #1: ffff8880215e3488 (&sb->s_type->i_mutex_key#7){++++}-{3:3}, at: do_truncate+0x1cf/0x2c0 fs/open.c:62 #2: ffff8880215e3310 (&ei->i_mmap_sem){++++}-{3:3}, at: ext4_setattr+0xec4/0x19c0 fs/ext4/inode.c:5519 #3: ffff8880215e3278 (&ei->i_data_sem/3){++++}-{3:3}, at: ext4_setattr+0x136d/0x19c0 fs/ext4/inode.c:5559 #4: ffff8880215e30c8 (&ei->xattr_sem){++++}-{3:3}, at: ext4_write_trylock_xattr fs/ext4/xattr.h:162 [inline] #4: ffff8880215e30c8 (&ei->xattr_sem){++++}-{3:3}, at: ext4_try_to_expand_extra_isize fs/ext4/inode.c:5938 [inline] #4: ffff8880215e30c8 (&ei->xattr_sem){++++}-{3:3}, at: __ext4_mark_inode_dirty+0x4fb/0x810 fs/ext4/inode.c:6018 stack backtrace: CPU: 1 PID: 2794 Comm: syz-executor543 Not tainted 5.10.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x177/0x211 lib/dump_stack.c:118 print_circular_bug+0x146/0x1b0 kernel/locking/lockdep.c:2002 check_noncircular+0x2cc/0x390 kernel/locking/lockdep.c:2123 check_prev_add kernel/locking/lockdep.c:2988 [inline] check_prevs_add kernel/locking/lockdep.c:3113 [inline] validate_chain+0x1695/0x58f0 kernel/locking/lockdep.c:3729 __lock_acquire+0x12fd/0x20d0 kernel/locking/lockdep.c:4955 lock_acquire+0x197/0x480 kernel/locking/lockdep.c:5566 down_write+0x93/0x180 kernel/locking/rwsem.c:1564 inode_lock include/linux/fs.h:782 [inline] ext4_xattr_inode_iget+0x42a/0x5c0 fs/ext4/xattr.c:425 ext4_xattr_inode_get+0x138/0x410 fs/ext4/xattr.c:485 ext4_xattr_move_to_block fs/ext4/xattr.c:2580 [inline] ext4_xattr_make_inode_space fs/ext4/xattr.c:2682 [inline] ext4_expand_extra_isize_ea+0xe70/0x1bb0 fs/ext4/xattr.c:2774 __ext4_expand_extra_isize+0x304/0x3f0 fs/ext4/inode.c:5898 ext4_try_to_expand_extra_isize fs/ext4/inode.c:5941 [inline] __ext4_mark_inode_dirty+0x591/0x810 fs/ext4/inode.c:6018 ext4_setattr+0x1400/0x19c0 fs/ext4/inode.c:5562 notify_change+0xbb6/0xe60 fs/attr.c:435 do_truncate+0x1de/0x2c0 fs/open.c:64 handle_truncate fs/namei.c:2970 [inline] do_open fs/namei.c:3311 [inline] path_openat+0x29f3/0x3290 fs/namei.c:3425 do_filp_open+0x20b/0x450 fs/namei.c:3452 do_sys_openat2+0x124/0x460 fs/open.c:1207 do_sys_open fs/open.c:1223 [inline] __do_sys_open fs/open.c:1231 [inline] __se_sys_open fs/open.c:1227 [inline] __x64_sys_open+0x221/0x270 fs/open.c:1227 do_syscall_64+0x6d/0xa0 arch/x86/entry/common.c:62 entry_SYSCALL_64_after_hwframe+0x61/0xcb RIP: 0033:0x7f0cde4ea229 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 21 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd81d1c978 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 0030656c69662f30 RCX: 00007f0cde4ea229 RDX: 0000000000000089 RSI: 00000000000a0a00 RDI: 00000000200001c0 RBP: 2f30656c69662f2e R08: 0000000000208000 R09: 0000000000208000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffd81d1c9c0 R13: 00007ffd81d1ca00 R14: 0000000000080000 R15: 0000000000000003 EXT4-fs error (device loop0): ext4_expand_extra_isize_ea:2730: inode #13: comm syz-executor543: corrupted in-inode xattr Signed-off-by: Wojciech Gładysz Link: https://patch.msgid.link/20240801143827.19135-1-wojciech.gladysz@infogain.com Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c14a674c3e3d..e0e1956dcdd3 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -458,7 +458,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); ext4_xattr_inode_set_ref(inode, 1); } else { - inode_lock(inode); + inode_lock_nested(inode, I_MUTEX_XATTR); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); } @@ -1039,7 +1039,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, s64 ref_count; int ret; - inode_lock(ea_inode); + inode_lock_nested(ea_inode, I_MUTEX_XATTR); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); if (ret) From d3476f3dad4ad68ae5f6b008ea6591d1520da5d8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 5 Aug 2024 22:12:41 +0200 Subject: [PATCH 34/93] ext4: don't set SB_RDONLY after filesystem errors When the filesystem is mounted with errors=remount-ro, we were setting SB_RDONLY flag to stop all filesystem modifications. We knew this misses proper locking (sb->s_umount) and does not go through proper filesystem remount procedure but it has been the way this worked since early ext2 days and it was good enough for catastrophic situation damage mitigation. Recently, syzbot has found a way (see link) to trigger warnings in filesystem freezing because the code got confused by SB_RDONLY changing under its hands. Since these days we set EXT4_FLAGS_SHUTDOWN on the superblock which is enough to stop all filesystem modifications, modifying SB_RDONLY shouldn't be needed. So stop doing that. Link: https://lore.kernel.org/all/000000000000b90a8e061e21d12f@google.com Reported-by: Christian Brauner Signed-off-by: Jan Kara Reviewed-by: Christian Brauner Link: https://patch.msgid.link/20240805201241.27286-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 89d9b34d787a..58423e6bf3d0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -735,11 +735,12 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* - * Make sure updated value of ->s_mount_flags will be visible before - * ->s_flags update + * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem + * modifications. We don't set SB_RDONLY because that requires + * sb->s_umount semaphore and setting it without proper remount + * procedure is confusing code such as freeze_super() leading to + * deadlocks and other problems. */ - smp_wmb(); - sb->s_flags |= SB_RDONLY; } static void update_super_work(struct work_struct *work) From dda898d7ffe85931f9cca6d702a51f33717c501e Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 9 Aug 2024 20:15:32 +0800 Subject: [PATCH 35/93] ext4: dax: fix overflowing extents beyond inode size when partially writing The dax_iomap_rw() does two things in each iteration: map written blocks and copy user data to blocks. If the process is killed by user(See signal handling in dax_iomap_iter()), the copied data will be returned and added on inode size, which means that the length of written extents may exceed the inode size, then fsck will fail. An example is given as: dd if=/dev/urandom of=file bs=4M count=1 dax_iomap_rw iomap_iter // round 1 ext4_iomap_begin ext4_iomap_alloc // allocate 0~2M extents(written flag) dax_iomap_iter // copy 2M data iomap_iter // round 2 iomap_iter_advance iter->pos += iter->processed // iter->pos = 2M ext4_iomap_begin ext4_iomap_alloc // allocate 2~4M extents(written flag) dax_iomap_iter fatal_signal_pending done = iter->pos - iocb->ki_pos // done = 2M ext4_handle_inode_extension ext4_update_inode_size // inode size = 2M fsck reports: Inode 13, i_size is 2097152, should be 4194304. Fix? Fix the problem by truncating extents if the written length is smaller than expected. Fixes: 776722e85d3b ("ext4: DAX iomap write support") CC: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=219136 Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Reviewed-by: Zhihao Cheng Link: https://patch.msgid.link/20240809121532.2105494-1-chengzhihao@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c89e434db6b7..be061bb64067 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -334,10 +334,10 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, * Clean up the inode after DIO or DAX extending write has completed and the * inode size has been updated using ext4_handle_inode_extension(). */ -static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count) +static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) { lockdep_assert_held_write(&inode->i_rwsem); - if (count < 0) { + if (need_trunc) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may @@ -586,7 +586,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) * writeback of delalloc blocks. */ WARN_ON_ONCE(ret == -EIOCBQUEUED); - ext4_inode_extension_cleanup(inode, ret); + ext4_inode_extension_cleanup(inode, ret < 0); } out: @@ -670,7 +670,7 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) if (extend) { ret = ext4_handle_inode_extension(inode, offset, ret); - ext4_inode_extension_cleanup(inode, ret); + ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); } out: inode_unlock(inode); From 130078d020e0214809f2e13cf4fb80c646020e94 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:41 +0800 Subject: [PATCH 36/93] ext4: factor out ext4_map_create_blocks() to allocate new blocks Factor out a common helper ext4_map_create_blocks() from ext4_map_blocks() to do a real blocks allocation, no logic changes. [ Note: this first patch of a ten patch series named "v3: simplify the counting and management of delalloc reserved blocks". The link to the v1 and v2 patch series are below. -- TYT ] Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240802115120.362902-1-yi.zhang@huaweicloud.com # v2 of patch series Link: https://patch.msgid.link/20240601034149.2169771-1-yi.zhang@huaweicloud.com # v1 of the patch series Link: https://patch.msgid.link/20240813123452.2824659-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 157 +++++++++++++++++++++++++----------------------- 1 file changed, 81 insertions(+), 76 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a0fa5192db8e..66b0128b0182 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -482,6 +482,86 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, return retval; } +static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) +{ + struct extent_status es; + unsigned int status; + int err, retval = 0; + + /* + * Here we clear m_flags because after allocating an new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * We need to check for EXT4 here because migrate could have + * changed the inode type in between. + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + /* + * We allocated new blocks which will result in i_data's + * format changing. Force the migrate to fail by clearing + * migrate flags. + */ + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + if (retval <= 0) + return retval; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode %lu: " + "retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. We also have to + * unmap metadata before zeroing as otherwise writeback can + * overwrite zeros with stale data from block device. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { + err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, + map->m_len); + if (err) + return err; + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if (flags & EXT4_GET_BLOCKS_PRE_IO && + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { + if (ext4_es_is_written(&es)) + return retval; + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && + ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + + return retval; +} + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -630,12 +710,6 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; - /* - * Here we clear m_flags because after allocating an new extent, - * it will be set again. - */ - map->m_flags &= ~EXT4_MAP_FLAGS; - /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -643,76 +717,7 @@ found: * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); - - /* - * We need to check for EXT4 here because migrate - * could have changed the inode type in between - */ - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags); - - if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { - /* - * We allocated new blocks which will result in - * i_data's format changing. Force the migrate - * to fail by clearing migrate flags - */ - ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); - } - } - - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - /* - * We have to zeroout blocks before inserting them into extent - * status tree. Otherwise someone could look them up there and - * use them before they are really zeroed. We also have to - * unmap metadata before zeroing as otherwise writeback can - * overwrite zeros with stale data from block device. - */ - if (flags & EXT4_GET_BLOCKS_ZERO && - map->m_flags & EXT4_MAP_MAPPED && - map->m_flags & EXT4_MAP_NEW) { - ret = ext4_issue_zeroout(inode, map->m_lblk, - map->m_pblk, map->m_len); - if (ret) { - retval = ret; - goto out_sem; - } - } - - /* - * If the extent has been zeroed out, we don't need to update - * extent status tree. - */ - if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { - if (ext4_es_is_written(&es)) - goto out_sem; - } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - } - -out_sem: + retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); From 8b8252884f2ff4d28e3ce1a825057b3ad2900c35 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:42 +0800 Subject: [PATCH 37/93] ext4: optimize the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag set When doing block allocation, magic EXT4_GET_BLOCKS_DELALLOC_RESERVE means the allocating range covers a range of delayed allocated clusters, the blocks and quotas have already been reserved in ext4_da_map_blocks(), we should update the reserved space and don't need to claim them again. At the moment, we only set this magic in mpage_map_one_extent() when allocating a range of delayed allocated clusters in the write back path, it makes things complicated since we have to notice and deal with the case of allocating non-delayed allocated clusters separately in ext4_ext_map_blocks(). For example, it we fallocate some blocks that have been delayed allocated, free space would be claimed again in ext4_mb_new_blocks() (this is wrong exactily), and we can't claim quota space again, we have to release the quota reservations made for that previously delayed allocated clusters. Move the position thats set the EXT4_GET_BLOCKS_DELALLOC_RESERVE to where we actually do block allocation, it could simplify above handling a lot, it means that we always set this magic once the allocation range covers delalloc blocks, no need to take care of the allocation path. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240813123452.2824659-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 66b0128b0182..2d8cfeaf290f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -489,6 +489,14 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, unsigned int status; int err, retval = 0; + /* + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE + * indicates that the blocks and quotas has already been + * checked when the data was copied into the page cache. + */ + if (map->m_flags & EXT4_MAP_DELAYED) + flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + /* * Here we clear m_flags because after allocating an new extent, * it will be set again. @@ -2224,11 +2232,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if - * the blocks in question are delalloc blocks. This indicates - * that the blocks and quotas has already been checked when - * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | @@ -2236,8 +2239,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (map->m_flags & BIT(BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) From eba8c368c8cb9ea05c08caf3dd1a0d0b87d614dc Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:43 +0800 Subject: [PATCH 38/93] ext4: don't set EXTENT_STATUS_DELAYED on allocated blocks Currently, we release delayed allocation reservation when removing delayed extent from extent status tree (which also happens when overwriting one extent with another one). When we allocated unwritten extent under some delayed allocated extent, we don't need the reservation anymore and hence we don't need to preserve the EXT4_MAP_DELAYED status bit. Allocating the new extent blocks will properly release the reservation. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240813123452.2824659-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 9 +-------- fs/ext4/inode.c | 11 ----------- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 17dcf13adde2..024cd37d53b3 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -869,14 +869,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, return; BUG_ON(end < lblk); - - if ((status & EXTENT_STATUS_DELAYED) && - (status & EXTENT_STATUS_WRITTEN)) { - ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " - " delayed and written which can potentially " - " cause data loss.", lblk, len); - WARN_ON(1); - } + WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED); newes.es_lblk = lblk; newes.es_len = len; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2d8cfeaf290f..9e0c02d5d688 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -558,12 +558,6 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); @@ -682,11 +676,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && - !(status & EXTENT_STATUS_WRITTEN) && - ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, - map->m_lblk + map->m_len - 1)) - status |= EXTENT_STATUS_DELAYED; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } From fccd632670408ab3066712aa90cc972b18d1b617 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:44 +0800 Subject: [PATCH 39/93] ext4: let __revise_pending() return newly inserted pendings Let __insert_pending() return 1 after successfully inserting a new pending cluster, and also let __revise_pending() to return the number of of newly inserted pendings. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240813123452.2824659-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 024cd37d53b3..4d24b56cfaf0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -887,7 +887,7 @@ retry: es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); - if ((err1 || err2 || err3) && revise_pending && !pr) + if ((err1 || err2 || err3 < 0) && revise_pending && !pr) pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); @@ -915,7 +915,7 @@ retry: if (revise_pending) { err3 = __revise_pending(inode, lblk, len, &pr); - if (err3 != 0) + if (err3 < 0) goto error; if (pr) { __free_pending(pr); @@ -924,7 +924,7 @@ retry: } error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2 || err3) + if (err1 || err2 || err3 < 0) goto retry; ext4_es_print_tree(inode); @@ -1933,7 +1933,7 @@ static struct pending_reservation *__get_pending(struct inode *inode, * @lblk - logical block in the cluster to be added * @prealloc - preallocated pending entry * - * Returns 0 on successful insertion and -ENOMEM on failure. If the + * Returns 1 on successful insertion and -ENOMEM on failure. If the * pending reservation is already in the set, returns successfully. */ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, @@ -1977,6 +1977,7 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, rb_link_node(&pr->rb_node, parent, p); rb_insert_color(&pr->rb_node, &tree->root); + ret = 1; out: return ret; @@ -2098,7 +2099,7 @@ retry: es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); - if (err1 || err2 || err3) { + if (err1 || err2 || err3 < 0) { if (lclu_allocated && !pr1) pr1 = __alloc_pending(true); if (end_allocated && !pr2) @@ -2128,7 +2129,7 @@ retry: if (lclu_allocated) { err3 = __insert_pending(inode, lblk, &pr1); - if (err3 != 0) + if (err3 < 0) goto error; if (pr1) { __free_pending(pr1); @@ -2137,7 +2138,7 @@ retry: } if (end_allocated) { err3 = __insert_pending(inode, end, &pr2); - if (err3 != 0) + if (err3 < 0) goto error; if (pr2) { __free_pending(pr2); @@ -2146,7 +2147,7 @@ retry: } error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2 || err3) + if (err1 || err2 || err3 < 0) goto retry; ext4_es_print_tree(inode); @@ -2256,7 +2257,9 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, * * Used after a newly allocated extent is added to the extents status tree. * Requires that the extents in the range have either written or unwritten - * status. Must be called while holding i_es_lock. + * status. Must be called while holding i_es_lock. Returns number of new + * inserts pending cluster on insert pendings, returns 0 on remove pendings, + * return -ENOMEM on failure. */ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, @@ -2266,6 +2269,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end = lblk + len - 1; ext4_lblk_t first, last; bool f_del = false, l_del = false; + int pendings = 0; int ret = 0; if (len == 0) @@ -2293,6 +2297,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out; + pendings += ret; } else { last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; @@ -2304,6 +2309,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out; + pendings += ret; } else __remove_pending(inode, last); } @@ -2316,6 +2322,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ret = __insert_pending(inode, first, prealloc); if (ret < 0) goto out; + pendings += ret; } else __remove_pending(inode, first); @@ -2327,9 +2334,10 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, ret = __insert_pending(inode, last, prealloc); if (ret < 0) goto out; + pendings += ret; } else __remove_pending(inode, last); } out: - return ret; + return (ret < 0) ? ret : pendings; } From f3baf33b9cae0e00fe1870abca952d5dfea53dc6 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:45 +0800 Subject: [PATCH 40/93] ext4: passing block allocation information to ext4_es_insert_extent() Just pass the block allocation flag to ext4_es_insert_extent() when we replacing a current extent after an actually block allocation or extent status conversion, this flag will be used by later changes. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240813123452.2824659-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 5 +++-- fs/ext4/extents_status.c | 6 +++--- fs/ext4/extents_status.h | 2 +- fs/ext4/inode.c | 8 ++++---- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e067f2dd0335..671dacd7c873 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3113,7 +3113,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN); + EXTENT_STATUS_WRITTEN, 0); } /* FIXME!! we need to try to merge to left or right after zero-out */ @@ -4097,7 +4097,8 @@ again: insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); - ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE); + ext4_es_insert_extent(inode, hole_start, len, ~0, + EXTENT_STATUS_HOLE, 0); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 4d24b56cfaf0..0580bc4bc762 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -848,7 +848,7 @@ out: */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status) + unsigned int status, int flags) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; @@ -862,8 +862,8 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", - lblk, len, pblk, status, inode->i_ino); + es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n", + lblk, len, pblk, status, flags, inode->i_ino); if (!len) return; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 3c8e2edee5d5..b74d693c1adb 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -129,7 +129,7 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status); + unsigned int status, int flags); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9e0c02d5d688..72f2b6ce97c0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -478,7 +478,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); + map->m_pblk, status, 0); return retval; } @@ -559,7 +559,7 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); + map->m_pblk, status, flags); return retval; } @@ -677,7 +677,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); + map->m_pblk, status, 0); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -4065,7 +4065,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) stop_block); ext4_es_insert_extent(inode, first_block, hole_len, ~0, - EXTENT_STATUS_HOLE); + EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); From c543e2429640293d9eda8c7841d4b5d5e8682826 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:46 +0800 Subject: [PATCH 41/93] ext4: update delalloc data reserve spcae in ext4_es_insert_extent() Now that we update data reserved space for delalloc after allocating new blocks in ext4_{ind|ext}_map_blocks(), and if bigalloc feature is enabled, we also need to query the extents_status tree to calculate the exact reserved clusters. This is complicated now and it appears that it's better to do this job in ext4_es_insert_extent(), because __es_remove_extent() have already count delalloc blocks when removing delalloc extents and __revise_pending() return new adding pending count, we could update the reserved blocks easily in ext4_es_insert_extent(). We direct reduce the reserved cluster count when replacing a delalloc extent. However, thers are two special cases need to concern about the quota claiming when doing direct block allocation (e.g. from fallocate). A), fallocate a range that covers a delalloc extent but start with non-delayed allocated blocks, e.g. a hole. hhhhhhh+ddddddd+ddddddd ^^^^^^^^^^^^^^^^^^^^^^^ fallocate this range Current ext4_map_blocks() can't always trim the extent since it may release i_data_sem before calling ext4_map_create_blocks() and raced by another delayed allocation. Hence the EXT4_GET_BLOCKS_DELALLOC_RESERVE may not set even when we are replacing a delalloc extent, without this flag set, the quota has already been claimed by ext4_mb_new_blocks(), so we should release the quota reservations instead of claim them again. B), bigalloc feature is enabled, fallocate a range that contains non-delayed allocated blocks. |< one cluster >| hhhhhhh+hhhhhhh+hhhhhhh+ddddddd ^^^^^^^ fallocate this range This case is similar to above case, the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag is also not set. Hence we should release the quota reservations if we replace a delalloc extent but without EXT4_GET_BLOCKS_DELALLOC_RESERVE set. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240813123452.2824659-7-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 37 ------------------------------------- fs/ext4/extents_status.c | 25 ++++++++++++++++++++++++- fs/ext4/indirect.c | 7 ------- 3 files changed, 24 insertions(+), 45 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 671dacd7c873..db8f9d79477c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4357,43 +4357,6 @@ got_allocated_blocks: goto out; } - /* - * Reduce the reserved cluster count to reflect successful deferred - * allocation of delayed allocated clusters or direct allocation of - * clusters discovered to be delayed allocated. Once allocated, a - * cluster is not included in the reserved count. - */ - if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { - /* - * When allocating delayed allocated clusters, simply - * reduce the reserved cluster count and claim quota - */ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); - } else { - ext4_lblk_t lblk, len; - unsigned int n; - - /* - * When allocating non-delayed allocated clusters - * (from fallocate, filemap, DIO, or clusters - * allocated when delalloc has been disabled by - * ext4_nonda_switch), reduce the reserved cluster - * count by the number of allocated clusters that - * have previously been delayed allocated. Quota - * has been claimed by ext4_mb_new_blocks() above, - * so release the quota reservations made for any - * previously delayed allocated clusters. - */ - lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk); - len = allocated_clusters << sbi->s_cluster_bits; - n = ext4_es_delayed_clu(inode, lblk, len); - if (n > 0) - ext4_da_update_reserve_space(inode, (int) n, 0); - } - } - /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an unwritten extent. diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0580bc4bc762..41adf0d69959 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -853,6 +853,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err1 = 0, err2 = 0, err3 = 0; + int resv_used = 0, pending = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; @@ -891,7 +892,7 @@ retry: pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); - err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1); if (err1 != 0) goto error; /* Free preallocated extent if it didn't get used. */ @@ -921,9 +922,31 @@ retry: __free_pending(pr); pr = NULL; } + pending = err3; } error: write_unlock(&EXT4_I(inode)->i_es_lock); + /* + * Reduce the reserved cluster count to reflect successful deferred + * allocation of delayed allocated clusters or direct allocation of + * clusters discovered to be delayed allocated. Once allocated, a + * cluster is not included in the reserved count. + * + * When direct allocating (from fallocate, filemap, DIO, or clusters + * allocated when delalloc has been disabled by ext4_nonda_switch()) + * an extent either 1) contains delayed blocks but start with + * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed + * allocated blocks which belong to delayed allocated clusters when + * bigalloc feature is enabled, quota has already been claimed by + * ext4_mb_new_blocks(), so release the quota reservations made for + * any previously delayed allocated clusters instead of claim them + * again. + */ + resv_used += pending; + if (resv_used) + ext4_da_update_reserve_space(inode, resv_used, + flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + if (err1 || err2 || err3 < 0) goto retry; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index d8ca7f64f952..7404f0935c90 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -652,13 +652,6 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; - /* - * Update reserved blocks/metadata blocks after successful block - * allocation which had been deferred till now. - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_da_update_reserve_space(inode, count, 1); - got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); From 6e124d5b4b02229f8aaa206b1952db31d1687523 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:47 +0800 Subject: [PATCH 42/93] ext4: drop ext4_es_delayed_clu() Since we move ext4_da_update_reserve_space() to ext4_es_insert_extent(), no one uses ext4_es_delayed_clu() and __es_delayed_clu(), just drop them. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240813123452.2824659-8-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 88 ---------------------------------------- fs/ext4/extents_status.h | 2 - 2 files changed, 90 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 41adf0d69959..b372b98af366 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -2178,94 +2178,6 @@ error: return; } -/* - * __es_delayed_clu - count number of clusters containing blocks that - * are delayed only - * - * @inode - file containing block range - * @start - logical block defining start of range - * @end - logical block defining end of range - * - * Returns the number of clusters containing only delayed (not delayed - * and unwritten) blocks in the range specified by @start and @end. Any - * cluster or part of a cluster within the range and containing a delayed - * and not unwritten block within the range is counted as a whole cluster. - */ -static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start, - ext4_lblk_t end) -{ - struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; - struct extent_status *es; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct rb_node *node; - ext4_lblk_t first_lclu, last_lclu; - unsigned long long last_counted_lclu; - unsigned int n = 0; - - /* guaranteed to be unequal to any ext4_lblk_t value */ - last_counted_lclu = ~0ULL; - - es = __es_tree_search(&tree->root, start); - - while (es && (es->es_lblk <= end)) { - if (ext4_es_is_delonly(es)) { - if (es->es_lblk <= start) - first_lclu = EXT4_B2C(sbi, start); - else - first_lclu = EXT4_B2C(sbi, es->es_lblk); - - if (ext4_es_end(es) >= end) - last_lclu = EXT4_B2C(sbi, end); - else - last_lclu = EXT4_B2C(sbi, ext4_es_end(es)); - - if (first_lclu == last_counted_lclu) - n += last_lclu - first_lclu; - else - n += last_lclu - first_lclu + 1; - last_counted_lclu = last_lclu; - } - node = rb_next(&es->rb_node); - if (!node) - break; - es = rb_entry(node, struct extent_status, rb_node); - } - - return n; -} - -/* - * ext4_es_delayed_clu - count number of clusters containing blocks that - * are both delayed and unwritten - * - * @inode - file containing block range - * @lblk - logical block defining start of range - * @len - number of blocks in range - * - * Locking for external use of __es_delayed_clu(). - */ -unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_lblk_t end; - unsigned int n; - - if (len == 0) - return 0; - - end = lblk + len - 1; - WARN_ON(end < lblk); - - read_lock(&ei->i_es_lock); - - n = __es_delayed_clu(inode, lblk, end); - - read_unlock(&ei->i_es_lock); - - return n; -} - /* * __revise_pending - makes, cancels, or leaves unchanged pending cluster * reservations for a specified block range depending diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index b74d693c1adb..47b3b55a852c 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -252,8 +252,6 @@ extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, bool lclu_allocated, bool end_allocated); -extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ From 15996a848564e40a3d030ec7e4603dddb9f425b6 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:48 +0800 Subject: [PATCH 43/93] ext4: use ext4_map_query_blocks() in ext4_map_blocks() The blocks map querying logic in ext4_map_blocks() are the same as ext4_map_query_blocks(), so switch to directly use it. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240813123452.2824659-9-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 72f2b6ce97c0..1fa59b0fc7f3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -658,27 +658,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, 0); - } else { - retval = ext4_ind_map_blocks(handle, inode, map, 0); - } - if (retval > 0) { - unsigned int status; - - if (unlikely(retval != map->m_len)) { - ext4_warning(inode->i_sb, - "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", - inode->i_ino, retval, map->m_len); - WARN_ON(1); - } - - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, 0); - } + retval = ext4_map_query_blocks(handle, inode, map); up_read((&EXT4_I(inode)->i_data_sem)); found: From 3b4ba269ab6673d664d2522a0e76797a3550983f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:49 +0800 Subject: [PATCH 44/93] ext4: drop unused ext4_es_store_status() The helper ext4_es_store_status() is unused now, just drop it. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240813123452.2824659-10-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 47b3b55a852c..3ca40f018994 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -224,13 +224,6 @@ static inline void ext4_es_store_pblock(struct extent_status *es, es->es_pblk = block; } -static inline void ext4_es_store_status(struct extent_status *es, - unsigned int status) -{ - es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | - (es->es_pblk & ~ES_MASK); -} - static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) From ce09036ea4f0a54e9dcd7ba644bb1db7cf2d95d4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:50 +0800 Subject: [PATCH 45/93] ext4: make extent status types exclusive Since we don't add delayed flag in unwritten extents, all of the four extent status types EXTENT_STATUS_WRITTEN, EXTENT_STATUS_UNWRITTEN, EXTENT_STATUS_DELAYED and EXTENT_STATUS_HOLE are exclusive now, add assertion when storing pblock before inserting extent into status tree and add comment to the status definition. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240813123452.2824659-11-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 3ca40f018994..7d7af642f7b2 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -42,6 +42,10 @@ enum { #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) +/* + * Besides EXTENT_STATUS_REFERENCED, all these extent type masks + * are exclusive, only one type can be set at a time. + */ #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) @@ -51,7 +55,9 @@ enum { #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ - EXTENT_STATUS_HOLE) << ES_SHIFT) + EXTENT_STATUS_HOLE)) + +#define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1))) struct ext4_sb_info; struct ext4_extent; @@ -156,7 +162,7 @@ static inline unsigned int ext4_es_status(struct extent_status *es) static inline unsigned int ext4_es_type(struct extent_status *es) { - return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; + return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK; } static inline int ext4_es_is_written(struct extent_status *es) @@ -228,6 +234,8 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { + WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } From b224b18497484eef9d2dbb3c803888a3f3a3475e Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:51 +0800 Subject: [PATCH 46/93] ext4: drop ext4_es_is_delonly() Since we don't add delayed flag in unwritten extents, so there is no difference between ext4_es_is_delayed() and ext4_es_is_delonly(), just drop ext4_es_is_delonly(). Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240813123452.2824659-12-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 18 +++++++++--------- fs/ext4/extents_status.h | 5 ----- fs/ext4/inode.c | 4 ++-- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index b372b98af366..68c47ecc01a5 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -558,8 +558,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1, if (ext4_es_is_hole(es1)) return 1; - /* we need to check delayed extent is without unwritten status */ - if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + /* we need to check delayed extent */ + if (ext4_es_is_delayed(es1)) return 1; return 0; @@ -1135,7 +1135,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t i, end, nclu; - if (!ext4_es_is_delonly(es)) + if (!ext4_es_is_delayed(es)) return; WARN_ON(len <= 0); @@ -1285,7 +1285,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, es = rc->left_es; while (es && ext4_es_end(es) >= EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { - if (ext4_es_is_delonly(es)) { + if (ext4_es_is_delayed(es)) { rc->ndelonly--; left_delonly = true; break; @@ -1305,7 +1305,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, } while (es && es->es_lblk <= EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { - if (ext4_es_is_delonly(es)) { + if (ext4_es_is_delayed(es)) { rc->ndelonly--; right_delonly = true; break; @@ -2226,7 +2226,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) - f_del = __es_scan_range(inode, &ext4_es_is_delonly, + f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { ret = __insert_pending(inode, first, prealloc); @@ -2238,7 +2238,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, - &ext4_es_is_delonly, + &ext4_es_is_delayed, end + 1, last); if (l_del) { ret = __insert_pending(inode, last, prealloc); @@ -2251,7 +2251,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, } else { first = EXT4_LBLK_CMASK(sbi, lblk); if (first != lblk) - f_del = __es_scan_range(inode, &ext4_es_is_delonly, + f_del = __es_scan_range(inode, &ext4_es_is_delayed, first, lblk - 1); if (f_del) { ret = __insert_pending(inode, first, prealloc); @@ -2263,7 +2263,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) - l_del = __es_scan_range(inode, &ext4_es_is_delonly, + l_del = __es_scan_range(inode, &ext4_es_is_delayed, end + 1, last); if (l_del) { ret = __insert_pending(inode, last, prealloc); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 7d7af642f7b2..4424232de298 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -190,11 +190,6 @@ static inline int ext4_es_is_mapped(struct extent_status *es) return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } -static inline int ext4_es_is_delonly(struct extent_status *es) -{ - return (ext4_es_is_delayed(es) && !ext4_es_is_unwritten(es)); -} - static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1fa59b0fc7f3..a6f1286e824d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1645,7 +1645,7 @@ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) int ret; /* Has delalloc reservation? */ - if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) + if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) return 1; /* Already been allocated? */ @@ -1766,7 +1766,7 @@ found: * Delayed extent could be allocated by fallocate. * So we need to check it. */ - if (ext4_es_is_delonly(&es)) { + if (ext4_es_is_delayed(&es)) { map->m_flags |= EXT4_MAP_DELAYED; return 0; } From 2046657e64a11b61d5ed07e0d60befd86303125e Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 13 Aug 2024 20:34:52 +0800 Subject: [PATCH 47/93] ext4: drop all delonly descriptions When counting reserved clusters, delayed type is always equal to delonly type now, hence drop all delonly descriptions in parameters and comments. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20240813123452.2824659-13-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 66 +++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 68c47ecc01a5..c786691dabd3 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1067,7 +1067,7 @@ out: } struct rsvd_count { - int ndelonly; + int ndelayed; bool first_do_lblk_found; ext4_lblk_t first_do_lblk; ext4_lblk_t last_do_lblk; @@ -1093,10 +1093,10 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct rb_node *node; - rc->ndelonly = 0; + rc->ndelayed = 0; /* - * for bigalloc, note the first delonly block in the range has not + * for bigalloc, note the first delayed block in the range has not * been found, record the extent containing the block to the left of * the region to be removed, if any, and note that there's no partial * cluster to track @@ -1116,9 +1116,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, } /* - * count_rsvd - count the clusters containing delayed and not unwritten - * (delonly) blocks in a range within an extent and add to - * the running tally in rsvd_count + * count_rsvd - count the clusters containing delayed blocks in a range + * within an extent and add to the running tally in rsvd_count * * @inode - file containing extent * @lblk - first block in range @@ -1141,7 +1140,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, WARN_ON(len <= 0); if (sbi->s_cluster_ratio == 1) { - rc->ndelonly += (int) len; + rc->ndelayed += (int) len; return; } @@ -1151,7 +1150,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, end = lblk + (ext4_lblk_t) len - 1; end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; - /* record the first block of the first delonly extent seen */ + /* record the first block of the first delayed extent seen */ if (!rc->first_do_lblk_found) { rc->first_do_lblk = i; rc->first_do_lblk_found = true; @@ -1165,7 +1164,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, * doesn't start with it, count it and stop tracking */ if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { - rc->ndelonly++; + rc->ndelayed++; rc->partial = false; } @@ -1175,7 +1174,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, */ if (EXT4_LBLK_COFF(sbi, i) != 0) { if (end >= EXT4_LBLK_CFILL(sbi, i)) { - rc->ndelonly++; + rc->ndelayed++; rc->partial = false; i = EXT4_LBLK_CFILL(sbi, i) + 1; } @@ -1183,11 +1182,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, /* * if the current cluster starts on a cluster boundary, count the - * number of whole delonly clusters in the extent + * number of whole delayed clusters in the extent */ if ((i + sbi->s_cluster_ratio - 1) <= end) { nclu = (end - i + 1) >> sbi->s_cluster_bits; - rc->ndelonly += nclu; + rc->ndelayed += nclu; i += nclu << sbi->s_cluster_bits; } @@ -1247,10 +1246,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root, * @rc - pointer to reserved count data * * The number of reservations to be released is equal to the number of - * clusters containing delayed and not unwritten (delonly) blocks within - * the range, minus the number of clusters still containing delonly blocks - * at the ends of the range, and minus the number of pending reservations - * within the range. + * clusters containing delayed blocks within the range, minus the number of + * clusters still containing delayed blocks at the ends of the range, and + * minus the number of pending reservations within the range. */ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, struct extent_status *right_es, @@ -1261,33 +1259,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct rb_node *node; ext4_lblk_t first_lclu, last_lclu; - bool left_delonly, right_delonly, count_pending; + bool left_delayed, right_delayed, count_pending; struct extent_status *es; if (sbi->s_cluster_ratio > 1) { /* count any remaining partial cluster */ if (rc->partial) - rc->ndelonly++; + rc->ndelayed++; - if (rc->ndelonly == 0) + if (rc->ndelayed == 0) return 0; first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); /* - * decrease the delonly count by the number of clusters at the - * ends of the range that still contain delonly blocks - + * decrease the delayed count by the number of clusters at the + * ends of the range that still contain delayed blocks - * these clusters still need to be reserved */ - left_delonly = right_delonly = false; + left_delayed = right_delayed = false; es = rc->left_es; while (es && ext4_es_end(es) >= EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { if (ext4_es_is_delayed(es)) { - rc->ndelonly--; - left_delonly = true; + rc->ndelayed--; + left_delayed = true; break; } node = rb_prev(&es->rb_node); @@ -1295,7 +1293,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, break; es = rb_entry(node, struct extent_status, rb_node); } - if (right_es && (!left_delonly || first_lclu != last_lclu)) { + if (right_es && (!left_delayed || first_lclu != last_lclu)) { if (end < ext4_es_end(right_es)) { es = right_es; } else { @@ -1306,8 +1304,8 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, while (es && es->es_lblk <= EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { if (ext4_es_is_delayed(es)) { - rc->ndelonly--; - right_delonly = true; + rc->ndelayed--; + right_delayed = true; break; } node = rb_next(&es->rb_node); @@ -1321,21 +1319,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, /* * Determine the block range that should be searched for * pending reservations, if any. Clusters on the ends of the - * original removed range containing delonly blocks are + * original removed range containing delayed blocks are * excluded. They've already been accounted for and it's not * possible to determine if an associated pending reservation * should be released with the information available in the * extents status tree. */ if (first_lclu == last_lclu) { - if (left_delonly | right_delonly) + if (left_delayed | right_delayed) count_pending = false; else count_pending = true; } else { - if (left_delonly) + if (left_delayed) first_lclu++; - if (right_delonly) + if (right_delayed) last_lclu--; if (first_lclu <= last_lclu) count_pending = true; @@ -1346,13 +1344,13 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, /* * a pending reservation found between first_lclu and last_lclu * represents an allocated cluster that contained at least one - * delonly block, so the delonly total must be reduced by one + * delayed block, so the delayed total must be reduced by one * for each pending reservation found and released */ if (count_pending) { pr = __pr_tree_search(&tree->root, first_lclu); while (pr && pr->lclu <= last_lclu) { - rc->ndelonly--; + rc->ndelayed--; node = rb_next(&pr->rb_node); rb_erase(&pr->rb_node, &tree->root); __free_pending(pr); @@ -1363,7 +1361,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, } } } - return rc->ndelonly; + return rc->ndelayed; } From 20cee68f5b44fdc2942d20f3172a262ec247b117 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Sat, 17 Aug 2024 16:55:10 +0800 Subject: [PATCH 48/93] ext4: clear EXT4_GROUP_INFO_WAS_TRIMMED_BIT even mount with discard Commit 3d56b8d2c74c ("ext4: Speed up FITRIM by recording flags in ext4_group_info") speed up fstrim by skipping trim trimmed group. We also has the chance to clear trimmed once there exists some block free for this group(mount without discard), and the next trim for this group will work well too. For mount with discard, we will issue dicard when we free blocks, so leave trimmed flag keep alive to skip useless trim trigger from userspace seems reasonable. But for some case like ext4 build on dm-thinpool(ext4 blocksize 4K, pool blocksize 128K), discard from ext4 maybe unaligned for dm thinpool, and thinpool will just finish this discard(see process_discard_bio when begein equals to end) without actually process discard. For this case, trim from userspace can really help us to free some thinpool block. So convert to clear trimmed flag for all case no matter mounted with discard or not. Fixes: 3d56b8d2c74c ("ext4: Speed up FITRIM by recording flags in ext4_group_info") Signed-off-by: yangerkun Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240817085510.2084444-1-yangerkun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2008d2d524c9..bf9ba7effbd5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3886,11 +3886,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. - * If the volume is mounted with -o discard, online discard - * is supported and the free blocks will be trimmed online. */ - if (!test_opt(sb, DISCARD)) - EXT4_MB_GRP_CLEAR_TRIMMED(db); + EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree @@ -6514,8 +6511,9 @@ do_more: " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, err); - } else - EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + } + + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_free_blocks(inode, &e4b, bit, count_clusters); From 5e5b2a56c57def1b41efd49596621504d7bcc61c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 20 Aug 2024 21:22:28 +0800 Subject: [PATCH 49/93] ext4: avoid buffer_head leak in ext4_mark_inode_used() Release inode_bitmap_bh from ext4_read_inode_bitmap() in ext4_mark_inode_used() to avoid buffer_head leak. By the way, remove unneeded goto for invalid ino when inode_bitmap_bh is NULL. Fixes: 8016e29f4362 ("ext4: fast commit recovery path") Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240820132234.2759926-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9dfd768ed9f8..ad7f13976dc6 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -755,10 +755,10 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) struct ext4_group_desc *gdp; ext4_group_t group; int bit; - int err = -EFSCORRUPTED; + int err; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) - goto out; + return -EFSCORRUPTED; group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); @@ -860,6 +860,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); sync_dirty_buffer(group_desc_bh); out: + brelse(inode_bitmap_bh); return err; } From 227d31b9214d1b9513383cf6c7180628d4b3b61f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 20 Aug 2024 21:22:29 +0800 Subject: [PATCH 50/93] ext4: avoid potential buffer_head leak in __ext4_new_inode() If a group is marked EXT4_GROUP_INFO_IBITMAP_CORRUPT after it's inode bitmap buffer_head was successfully verified, then __ext4_new_inode() will get a valid inode_bitmap_bh of a corrupted group from ext4_read_inode_bitmap() in which case inode_bitmap_bh misses a release. Hnadle "IS_ERR(inode_bitmap_bh)" and group corruption separately like how ext4_free_inode() does to avoid buffer_head leak. Fixes: 9008a58e5dce ("ext4: make the bitmap read routines return real error codes") Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240820132234.2759926-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ad7f13976dc6..9e2c08a8665f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1054,12 +1054,13 @@ got_group: brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ - if (((!(sbi->s_mount_state & EXT4_FC_REPLAY)) - && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || - IS_ERR(inode_bitmap_bh)) { + if (IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; goto next_group; } + if (!(sbi->s_mount_state & EXT4_FC_REPLAY) && + EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; repeat_in_this_group: ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); From bb0a12c3439b10d88412fd3102df5b9a6e3cd6dc Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 20 Aug 2024 21:22:30 +0800 Subject: [PATCH 51/93] ext4: avoid negative min_clusters in find_group_orlov() min_clusters is signed integer and will be converted to unsigned integer when compared with unsigned number stats.free_clusters. If min_clusters is negative, it will be converted to a huge unsigned value in which case all groups may not meet the actual desired free clusters. Set negative min_clusters to 0 to avoid unexpected behavior. Fixes: ac27a0ec112a ("[PATCH] ext4: initial copy of files from ext3") Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240820132234.2759926-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9e2c08a8665f..81641be38c0e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -514,6 +514,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, if (min_inodes < 1) min_inodes = 1; min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; + if (min_clusters < 0) + min_clusters = 0; /* * Start looking in the flex group where we last allocated an From f7c69be505a5ec3df13b65cedb245343b2a5f751 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 20 Aug 2024 21:22:31 +0800 Subject: [PATCH 52/93] ext4: remove dead check in __ext4_new_inode() If we can't grab any inode, the prvious find_inode_bit() will set ino to be >= EXT4_INODES_PER_GROUP(sb). So the check of need to repeat in the same group is not needed. Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240820132234.2759926-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 81641be38c0e..8ee4754c70f5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1064,7 +1064,6 @@ got_group: EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; -repeat_in_this_group: ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (!ret2) goto next_group; @@ -1114,8 +1113,6 @@ repeat_in_this_group: if (!ret2) goto got; /* we grabbed the inode! */ - if (ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; next_group: if (++group == ngroups) group = 0; From 66eafbde7d74e36cd80e6b6246ea4bd599416e2c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 20 Aug 2024 21:22:32 +0800 Subject: [PATCH 53/93] ext4: move checksum length calculation of inode bitmap into ext4_inode_bitmap_csum_[verify/set]() functions There are some little improve: 1. remove repeat code to calculate checksum length of inode bitmap 2. remove unnecessary checksum length calculation if checksum is not enabled. 3. use more efficient bit shift operation instead of div opreation. Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240820132234.2759926-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/bitmap.c | 8 ++++++-- fs/ext4/ext4.h | 4 ++-- fs/ext4/ialloc.c | 12 ++++-------- fs/ext4/resize.c | 3 +-- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index cd725bebe69e..2a135075468d 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -18,15 +18,17 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars) int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz) + struct buffer_head *bh) { __u32 hi; __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz; if (!ext4_has_metadata_csum(sb)) return 1; + sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { @@ -40,14 +42,16 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz) + struct buffer_head *bh) { __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz; if (!ext4_has_metadata_csum(sb)) return; + sz = EXT4_INODES_PER_GROUP(sb) >> 3; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5845e4aa091a..7c3438b431e4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2693,10 +2693,10 @@ struct mmpd_data { extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz); + struct buffer_head *bh); int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, - struct buffer_head *bh, int sz); + struct buffer_head *bh); void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8ee4754c70f5..0234f5e91d8d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -98,8 +98,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); - if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8) || + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " @@ -327,8 +326,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } - ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -853,8 +851,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } @@ -1225,8 +1222,7 @@ got: } } if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0ba9837d65ca..e04eb08b9060 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1319,8 +1319,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, bh = ext4_get_bitmap(sb, group_data->inode_bitmap); if (!bh) return -EIO; - ext4_inode_bitmap_csum_set(sb, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); + ext4_inode_bitmap_csum_set(sb, gdp, bh); brelse(bh); bh = ext4_get_bitmap(sb, group_data->block_bitmap); From 7523a7ef099abe7b9a98c4c8f722fe40541c9723 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 20 Aug 2024 21:22:33 +0800 Subject: [PATCH 54/93] ext4: remove unneeded NULL check of buffer_head in ext4_mark_inode_used() If gdp from ext4_get_group_desc() is not NULL, then returned group_desc_bh won't be NULL either. Remove check of group_desc_bh and only check returned gdp from ext4_get_group_desc() like how other callers do. Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240820132234.2759926-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 0234f5e91d8d..daacb9f0ef50 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -772,7 +772,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) } gdp = ext4_get_group_desc(sb, group, &group_desc_bh); - if (!gdp || !group_desc_bh) { + if (!gdp) { err = -EINVAL; goto out; } From 7d2b48881877ace14ea85a7e3a17ff8f80f3d8e6 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 20 Aug 2024 21:22:34 +0800 Subject: [PATCH 55/93] ext4: check buffer_verified in advance to avoid unneeded ext4_get_group_info() Check buffer_verified in advance to avoid unneeded ext4_get_group_info(). This could be a simple cleanup as compiler may handle this. Signed-off-by: Kemeng Shi Link: https://patch.msgid.link/20240820132234.2759926-8-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index daacb9f0ef50..7f1a5f90dbbd 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -87,10 +87,10 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; - grp = ext4_get_group_info(sb, block_group); - if (buffer_verified(bh)) return 0; + + grp = ext4_get_group_info(sb, block_group); if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; From cd69f8f9de280e331c9e6ff689ced0a688a9ce8f Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 21 Aug 2024 12:23:21 -0300 Subject: [PATCH 56/93] ext4: ext4_search_dir should return a proper error ext4_search_dir currently returns -1 in case of a failure, while it returns 0 when the name is not found. In such failure cases, it should return an error code instead. This becomes even more important when ext4_find_inline_entry returns an error code as well in the next commit. -EFSCORRUPTED seems appropriate as such error code as these failures would be caused by unexpected record lengths and is in line with other instances of ext4_check_dir_entry failures. In the case of ext4_dx_find_entry, the current use of ERR_BAD_DX_DIR was left as is to reduce the risk of regressions. Signed-off-by: Thadeu Lima de Souza Cascardo Link: https://patch.msgid.link/20240821152324.3621860-2-cascardo@igalia.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1de1c1b460a7..7a659a31f837 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1482,7 +1482,7 @@ static bool ext4_match(struct inode *parent, } /* - * Returns 0 if not found, -1 on failure, and 1 on success + * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success */ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, @@ -1503,7 +1503,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, * a full check */ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, buf_size, offset)) - return -1; + return -EFSCORRUPTED; *res_dir = de; return 1; } @@ -1511,7 +1511,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); if (de_len <= 0) - return -1; + return -EFSCORRUPTED; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } @@ -1663,8 +1663,10 @@ restart: goto cleanup_and_exit; } else { brelse(bh); - if (i < 0) + if (i < 0) { + ret = ERR_PTR(i); goto cleanup_and_exit; + } } next: if (++block >= nblocks) @@ -1758,7 +1760,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, if (retval == 1) goto success; brelse(bh); - if (retval == -1) { + if (retval < 0) { bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } From 4d231b91a944f3cab355fce65af5871fb5d7735b Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 21 Aug 2024 12:23:22 -0300 Subject: [PATCH 57/93] ext4: return error on ext4_find_inline_entry In case of errors when reading an inode from disk or traversing inline directory entries, return an error-encoded ERR_PTR instead of returning NULL. ext4_find_inline_entry only caller, __ext4_find_entry already returns such encoded errors. Signed-off-by: Thadeu Lima de Souza Cascardo Link: https://patch.msgid.link/20240821152324.3621860-3-cascardo@igalia.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e7a09a99837b..7b98b1bf1dc9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1669,8 +1669,9 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, void *inline_start; int inline_size; - if (ext4_get_inode_loc(dir, &iloc)) - return NULL; + ret = ext4_get_inode_loc(dir, &iloc); + if (ret) + return ERR_PTR(ret); down_read(&EXT4_I(dir)->xattr_sem); if (!ext4_has_inline_data(dir)) { @@ -1701,7 +1702,10 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, out: brelse(iloc.bh); - iloc.bh = NULL; + if (ret < 0) + iloc.bh = ERR_PTR(ret); + else + iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); return iloc.bh; From 51e14e78b5fb3e6f839393cd2d34386ee7b69af3 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 21 Aug 2024 12:23:23 -0300 Subject: [PATCH 58/93] ext4: explicitly exit when ext4_find_inline_entry returns an error __ext4_find_entry currently ignores the return of ext4_find_inline_entry, except for returning the bh or NULL when has_inline_data is 1. Even though has_inline_data is set to 1 before calling ext4_find_inline_entry and would only be set to 0 when that function returns NULL, check for an encoded error return explicitly in order to exit. That makes the code more readable, not requiring that one assumes the cases when has_inline_data is 1. Signed-off-by: Thadeu Lima de Souza Cascardo Link: https://patch.msgid.link/20240821152324.3621860-4-cascardo@igalia.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7a659a31f837..790db7eac6c2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1574,7 +1574,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, &has_inline_data); if (inlined) *inlined = has_inline_data; - if (has_inline_data) + if (has_inline_data || IS_ERR(ret)) goto cleanup_and_exit; } From c6b72f5d82b1017bad80f9ebf502832fc321d796 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 21 Aug 2024 12:23:24 -0300 Subject: [PATCH 59/93] ext4: avoid OOB when system.data xattr changes underneath the filesystem When looking up for an entry in an inlined directory, if e_value_offs is changed underneath the filesystem by some change in the block device, it will lead to an out-of-bounds access that KASAN detects as an UAF. EXT4-fs (loop0): mounted filesystem 00000000-0000-0000-0000-000000000000 r/w without journal. Quota mode: none. loop0: detected capacity change from 2048 to 2047 ================================================================== BUG: KASAN: use-after-free in ext4_search_dir+0xf2/0x1c0 fs/ext4/namei.c:1500 Read of size 1 at addr ffff88803e91130f by task syz-executor269/5103 CPU: 0 UID: 0 PID: 5103 Comm: syz-executor269 Not tainted 6.11.0-rc4-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 ext4_search_dir+0xf2/0x1c0 fs/ext4/namei.c:1500 ext4_find_inline_entry+0x4be/0x5e0 fs/ext4/inline.c:1697 __ext4_find_entry+0x2b4/0x1b30 fs/ext4/namei.c:1573 ext4_lookup_entry fs/ext4/namei.c:1727 [inline] ext4_lookup+0x15f/0x750 fs/ext4/namei.c:1795 lookup_one_qstr_excl+0x11f/0x260 fs/namei.c:1633 filename_create+0x297/0x540 fs/namei.c:3980 do_symlinkat+0xf9/0x3a0 fs/namei.c:4587 __do_sys_symlinkat fs/namei.c:4610 [inline] __se_sys_symlinkat fs/namei.c:4607 [inline] __x64_sys_symlinkat+0x95/0xb0 fs/namei.c:4607 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f3e73ced469 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 21 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff4d40c258 EFLAGS: 00000246 ORIG_RAX: 000000000000010a RAX: ffffffffffffffda RBX: 0032656c69662f2e RCX: 00007f3e73ced469 RDX: 0000000020000200 RSI: 00000000ffffff9c RDI: 00000000200001c0 RBP: 0000000000000000 R08: 00007fff4d40c290 R09: 00007fff4d40c290 R10: 0023706f6f6c2f76 R11: 0000000000000246 R12: 00007fff4d40c27c R13: 0000000000000003 R14: 431bde82d7b634db R15: 00007fff4d40c2b0 Calling ext4_xattr_ibody_find right after reading the inode with ext4_get_inode_loc will lead to a check of the validity of the xattrs, avoiding this problem. Reported-by: syzbot+0c2508114d912a54ee79@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0c2508114d912a54ee79 Fixes: e8e948e7802a ("ext4: let ext4_find_entry handle inline data") Signed-off-by: Thadeu Lima de Souza Cascardo Link: https://patch.msgid.link/20240821152324.3621860-5-cascardo@igalia.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7b98b1bf1dc9..44a5f6df59ec 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1664,25 +1664,36 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) { + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; int ret; - struct ext4_iloc iloc; void *inline_start; int inline_size; - ret = ext4_get_inode_loc(dir, &iloc); + ret = ext4_get_inode_loc(dir, &is.iloc); if (ret) return ERR_PTR(ret); down_read(&EXT4_I(dir)->xattr_sem); + + ret = ext4_xattr_ibody_find(dir, &i, &is); + if (ret) + goto out; + if (!ext4_has_inline_data(dir)) { *has_inline_data = 0; goto out; } - inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; @@ -1692,23 +1703,23 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) goto out; - inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size, dir, fname, 0, res_dir); if (ret == 1) goto out_find; out: - brelse(iloc.bh); + brelse(is.iloc.bh); if (ret < 0) - iloc.bh = ERR_PTR(ret); + is.iloc.bh = ERR_PTR(ret); else - iloc.bh = NULL; + is.iloc.bh = NULL; out_find: up_read(&EXT4_I(dir)->xattr_sem); - return iloc.bh; + return is.iloc.bh; } int ext4_delete_inline_entry(handle_t *handle, From edfa71dbe841075109e3e1da7d3925b45328ed25 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:21 +0800 Subject: [PATCH 60/93] ext4: refactor ext4_ext_rm_idx() to index 'path' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As suggested by Honza in Link,modify ext4_ext_rm_idx() to leave 'path' alone and just index it like ext4_ext_correct_indexes() does it. This facilitates adding error handling later. No functional changes. Suggested-by: Jan Kara Link: https://lore.kernel.org/all/20230216130305.nrbtd42tppxhbynn@quack3/ Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-2-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index db8f9d79477c..ca08b4036e28 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2279,27 +2279,26 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, { int err; ext4_fsblk_t leaf; + int k = depth - 1; /* free index block */ - depth--; - path = path + depth; - leaf = ext4_idx_pblock(path->p_idx); - if (unlikely(path->p_hdr->eh_entries == 0)) { - EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + leaf = ext4_idx_pblock(path[k].p_idx); + if (unlikely(path[k].p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr->eh_entries == 0", k); return -EFSCORRUPTED; } - err = ext4_ext_get_access(handle, inode, path); + err = ext4_ext_get_access(handle, inode, path + k); if (err) return err; - if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { - int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + if (path[k].p_idx != EXT_LAST_INDEX(path[k].p_hdr)) { + int len = EXT_LAST_INDEX(path[k].p_hdr) - path[k].p_idx; len *= sizeof(struct ext4_extent_idx); - memmove(path->p_idx, path->p_idx + 1, len); + memmove(path[k].p_idx, path[k].p_idx + 1, len); } - le16_add_cpu(&path->p_hdr->eh_entries, -1); - err = ext4_ext_dirty(handle, inode, path); + le16_add_cpu(&path[k].p_hdr->eh_entries, -1); + err = ext4_ext_dirty(handle, inode, path + k); if (err) return err; ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); @@ -2308,15 +2307,14 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - while (--depth >= 0) { - if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) + while (--k >= 0) { + if (path[k + 1].p_idx != EXT_FIRST_INDEX(path[k + 1].p_hdr)) break; - path--; - err = ext4_ext_get_access(handle, inode, path); + err = ext4_ext_get_access(handle, inode, path + k); if (err) break; - path->p_idx->ei_block = (path+1)->p_idx->ei_block; - err = ext4_ext_dirty(handle, inode, path); + path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path + k); if (err) break; } From 3e8a584c82cc999b99ea17c31fc2da101201545f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:22 +0800 Subject: [PATCH 61/93] ext4: prevent partial update of the extents path In ext4_ext_rm_idx() and ext4_ext_correct_indexes(), there is no proper rollback of already executed updates when updating a level of the extents path fails, so we may get an inconsistent extents tree, which may trigger some bad things in errors=continue mode. Hence clear the verified bit of modified extents buffers if the tree fails to be updated in ext4_ext_rm_idx() or ext4_ext_correct_indexes(), which forces the extents buffers to be checked in ext4_valid_extent_entries(), ensuring that the extents tree is consistent. Signed-off-by: zhanchengbin Link: https://lore.kernel.org/r/20230213080514.535568-3-zhanchengbin1@huawei.com/ Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-3-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ca08b4036e28..4395e2b668ec 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1749,12 +1749,23 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, break; err = ext4_ext_get_access(handle, inode, path + k); if (err) - break; + goto clean; path[k].p_idx->ei_block = border; err = ext4_ext_dirty(handle, inode, path + k); if (err) - break; + goto clean; } + return 0; + +clean: + /* + * The path[k].p_bh is either unmodified or with no verified bit + * set (see ext4_ext_get_access()). So just clear the verified bit + * of the successfully modified extents buffers, which will force + * these extents to be checked to avoid using inconsistent data. + */ + while (++k < depth) + clear_buffer_verified(path[k].p_bh); return err; } @@ -2312,12 +2323,24 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, break; err = ext4_ext_get_access(handle, inode, path + k); if (err) - break; + goto clean; path[k].p_idx->ei_block = path[k + 1].p_idx->ei_block; err = ext4_ext_dirty(handle, inode, path + k); if (err) - break; + goto clean; } + return 0; + +clean: + /* + * The path[k].p_bh is either unmodified or with no verified bit + * set (see ext4_ext_get_access()). So just clear the verified bit + * of the successfully modified extents buffers, which will force + * these extents to be checked to avoid using inconsistent data. + */ + while (++k < depth) + clear_buffer_verified(path[k].p_bh); + return err; } From c26ab35702f8cd0cdc78f96aa5856bfb77be798f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:23 +0800 Subject: [PATCH 62/93] ext4: fix slab-use-after-free in ext4_split_extent_at() We hit the following use-after-free: ================================================================== BUG: KASAN: slab-use-after-free in ext4_split_extent_at+0xba8/0xcc0 Read of size 2 at addr ffff88810548ed08 by task kworker/u20:0/40 CPU: 0 PID: 40 Comm: kworker/u20:0 Not tainted 6.9.0-dirty #724 Call Trace: kasan_report+0x93/0xc0 ext4_split_extent_at+0xba8/0xcc0 ext4_split_extent.isra.0+0x18f/0x500 ext4_split_convert_extents+0x275/0x750 ext4_ext_handle_unwritten_extents+0x73e/0x1580 ext4_ext_map_blocks+0xe20/0x2dc0 ext4_map_blocks+0x724/0x1700 ext4_do_writepages+0x12d6/0x2a70 [...] Allocated by task 40: __kmalloc_noprof+0x1ac/0x480 ext4_find_extent+0xf3b/0x1e70 ext4_ext_map_blocks+0x188/0x2dc0 ext4_map_blocks+0x724/0x1700 ext4_do_writepages+0x12d6/0x2a70 [...] Freed by task 40: kfree+0xf1/0x2b0 ext4_find_extent+0xa71/0x1e70 ext4_ext_insert_extent+0xa22/0x3260 ext4_split_extent_at+0x3ef/0xcc0 ext4_split_extent.isra.0+0x18f/0x500 ext4_split_convert_extents+0x275/0x750 ext4_ext_handle_unwritten_extents+0x73e/0x1580 ext4_ext_map_blocks+0xe20/0x2dc0 ext4_map_blocks+0x724/0x1700 ext4_do_writepages+0x12d6/0x2a70 [...] ================================================================== The flow of issue triggering is as follows: ext4_split_extent_at path = *ppath ext4_ext_insert_extent(ppath) ext4_ext_create_new_leaf(ppath) ext4_find_extent(orig_path) path = *orig_path read_extent_tree_block // return -ENOMEM or -EIO ext4_free_ext_path(path) kfree(path) *orig_path = NULL a. If err is -ENOMEM: ext4_ext_dirty(path + path->p_depth) // path use-after-free !!! b. If err is -EIO and we have EXT_DEBUG defined: ext4_ext_show_leaf(path) eh = path[depth].p_hdr // path also use-after-free !!! So when trying to zeroout or fix the extent length, call ext4_find_extent() to update the path. In addition we use *ppath directly as an ext4_ext_show_leaf() input to avoid possible use-after-free when EXT_DEBUG is defined, and to avoid unnecessary path updates. Fixes: dfe5080939ea ("ext4: drop EXT4_EX_NOFREE_ON_ERR from rest of extents handling code") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-4-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4395e2b668ec..fe6bca63f9d6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3251,6 +3251,25 @@ static int ext4_split_extent_at(handle_t *handle, if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) goto out; + /* + * Update path is required because previous ext4_ext_insert_extent() + * may have freed or reallocated the path. Using EXT4_EX_NOFAIL + * guarantees that ext4_find_extent() will not return -ENOMEM, + * otherwise -ENOMEM will cause a retry in do_writepages(), and a + * WARN_ON may be triggered in ext4_da_update_reserve_space() due to + * an incorrect ee_len causing the i_reserved_data_blocks exception. + */ + path = ext4_find_extent(inode, ee_block, ppath, + flags | EXT4_EX_NOFAIL); + if (IS_ERR(path)) { + EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", + split, PTR_ERR(path)); + return PTR_ERR(path); + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + *ppath = path; + if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { @@ -3303,7 +3322,7 @@ fix_extent_len: ext4_ext_dirty(handle, inode, path + path->p_depth); return err; out: - ext4_ext_show_leaf(inode, path); + ext4_ext_show_leaf(inode, *ppath); return err; } From 4e2524ba2ca5f54bdbb9e5153bea00421ef653f5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:24 +0800 Subject: [PATCH 63/93] ext4: avoid use-after-free in ext4_ext_show_leaf() In ext4_find_extent(), path may be freed by error or be reallocated, so using a previously saved *ppath may have been freed and thus may trigger use-after-free, as follows: ext4_split_extent path = *ppath; ext4_split_extent_at(ppath) path = ext4_find_extent(ppath) ext4_split_extent_at(ppath) // ext4_find_extent fails to free path // but zeroout succeeds ext4_ext_show_leaf(inode, path) eh = path[depth].p_hdr // path use-after-free !!! Similar to ext4_split_extent_at(), we use *ppath directly as an input to ext4_ext_show_leaf(). Fix a spelling error by the way. Same problem in ext4_ext_handle_unwritten_extents(). Since 'path' is only used in ext4_ext_show_leaf(), remove 'path' and use *ppath directly. This issue is triggered only when EXT_DEBUG is defined and therefore does not affect functionality. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-5-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fe6bca63f9d6..caef91356557 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3327,7 +3327,7 @@ out: } /* - * ext4_split_extents() splits an extent and mark extent which is covered + * ext4_split_extent() splits an extent and mark extent which is covered * by @map as split_flags indicates * * It may result in splitting the extent into multiple extents (up to three) @@ -3403,7 +3403,7 @@ static int ext4_split_extent(handle_t *handle, goto out; } - ext4_ext_show_leaf(inode, path); + ext4_ext_show_leaf(inode, *ppath); out: return err ? err : allocated; } @@ -3868,14 +3868,13 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { - struct ext4_ext_path __maybe_unused *path = *ppath; int ret = 0; int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, allocated); - ext4_ext_show_leaf(inode, path); + ext4_ext_show_leaf(inode, *ppath); /* * When writing into unwritten space, we should not fail to @@ -3972,7 +3971,7 @@ out1: if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; - ext4_ext_show_leaf(inode, path); + ext4_ext_show_leaf(inode, *ppath); out2: return err ? err : allocated; } From 5b4b2dcace35f618fe361a87bae6f0d13af31bc1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:25 +0800 Subject: [PATCH 64/93] ext4: update orig_path in ext4_find_extent() In ext4_find_extent(), if the path is not big enough, we free it and set *orig_path to NULL. But after reallocating and successfully initializing the path, we don't update *orig_path, in which case the caller gets a valid path but a NULL ppath, and this may cause a NULL pointer dereference or a path memory leak. For example: ext4_split_extent path = *ppath = 2000 ext4_find_extent if (depth > path[0].p_maxdepth) kfree(path = 2000); *orig_path = path = NULL; path = kcalloc() = 3000 ext4_split_extent_at(*ppath = NULL) path = *ppath; ex = path[depth].p_ext; // NULL pointer dereference! ================================================================== BUG: kernel NULL pointer dereference, address: 0000000000000010 CPU: 6 UID: 0 PID: 576 Comm: fsstress Not tainted 6.11.0-rc2-dirty #847 RIP: 0010:ext4_split_extent_at+0x6d/0x560 Call Trace: ext4_split_extent.isra.0+0xcb/0x1b0 ext4_ext_convert_to_initialized+0x168/0x6c0 ext4_ext_handle_unwritten_extents+0x325/0x4d0 ext4_ext_map_blocks+0x520/0xdb0 ext4_map_blocks+0x2b0/0x690 ext4_iomap_begin+0x20e/0x2c0 [...] ================================================================== Therefore, *orig_path is updated when the extent lookup succeeds, so that the caller can safely use path or *ppath. Fixes: 10809df84a4d ("ext4: teach ext4_ext_find_extent() to realloc path if necessary") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240822023545.1994557-6-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 ++- fs/ext4/move_extent.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index caef91356557..c8e7676f6af9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -957,6 +957,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_ext_show_path(inode, path); + if (orig_path) + *orig_path = path; return path; err: @@ -3268,7 +3270,6 @@ static int ext4_split_extent_at(handle_t *handle, } depth = ext_depth(inode); ex = path[depth].p_ext; - *ppath = path; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 516897b0218e..228b56f7d047 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -36,7 +36,6 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, *ppath = NULL; return -ENODATA; } - *ppath = path; return 0; } From a164f3a432aae62ca23d03e6d926b122ee5b860d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:26 +0800 Subject: [PATCH 65/93] ext4: aovid use-after-free in ext4_ext_insert_extent() As Ojaswin mentioned in Link, in ext4_ext_insert_extent(), if the path is reallocated in ext4_ext_create_new_leaf(), we'll use the stale path and cause UAF. Below is a sample trace with dummy values: ext4_ext_insert_extent path = *ppath = 2000 ext4_ext_create_new_leaf(ppath) ext4_find_extent(ppath) path = *ppath = 2000 if (depth > path[0].p_maxdepth) kfree(path = 2000); *ppath = path = NULL; path = kcalloc() = 3000 *ppath = 3000; return path; /* here path is still 2000, UAF! */ eh = path[depth].p_hdr ================================================================== BUG: KASAN: slab-use-after-free in ext4_ext_insert_extent+0x26d4/0x3330 Read of size 8 at addr ffff8881027bf7d0 by task kworker/u36:1/179 CPU: 3 UID: 0 PID: 179 Comm: kworker/u6:1 Not tainted 6.11.0-rc2-dirty #866 Call Trace: ext4_ext_insert_extent+0x26d4/0x3330 ext4_ext_map_blocks+0xe22/0x2d40 ext4_map_blocks+0x71e/0x1700 ext4_do_writepages+0x1290/0x2800 [...] Allocated by task 179: ext4_find_extent+0x81c/0x1f70 ext4_ext_map_blocks+0x146/0x2d40 ext4_map_blocks+0x71e/0x1700 ext4_do_writepages+0x1290/0x2800 ext4_writepages+0x26d/0x4e0 do_writepages+0x175/0x700 [...] Freed by task 179: kfree+0xcb/0x240 ext4_find_extent+0x7c0/0x1f70 ext4_ext_insert_extent+0xa26/0x3330 ext4_ext_map_blocks+0xe22/0x2d40 ext4_map_blocks+0x71e/0x1700 ext4_do_writepages+0x1290/0x2800 ext4_writepages+0x26d/0x4e0 do_writepages+0x175/0x700 [...] ================================================================== So use *ppath to update the path to avoid the above problem. Reported-by: Ojaswin Mujoo Closes: https://lore.kernel.org/r/ZqyL6rmtwl6N4MWR@li-bb2b2a4c-3307-11b2-a85c-8fa5c3a69313.ibm.com Fixes: 10809df84a4d ("ext4: teach ext4_ext_find_extent() to realloc path if necessary") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240822023545.1994557-7-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c8e7676f6af9..b431fcfb6ef8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2116,6 +2116,7 @@ prepend: ppath, newext); if (err) goto cleanup; + path = *ppath; depth = ext_depth(inode); eh = path[depth].p_hdr; From 5c0f4cc84d3a601c99bc5e6e6eb1cbda542cce95 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:27 +0800 Subject: [PATCH 66/93] ext4: drop ppath from ext4_ext_replay_update_ex() to avoid double-free When calling ext4_force_split_extent_at() in ext4_ext_replay_update_ex(), the 'ppath' is updated but it is the 'path' that is freed, thus potentially triggering a double-free in the following process: ext4_ext_replay_update_ex ppath = path ext4_force_split_extent_at(&ppath) ext4_split_extent_at ext4_ext_insert_extent ext4_ext_create_new_leaf ext4_ext_grow_indepth ext4_find_extent if (depth > path[0].p_maxdepth) kfree(path) ---> path First freed *orig_path = path = NULL ---> null ppath kfree(path) ---> path double-free !!! So drop the unnecessary ppath and use path directly to avoid this problem. And use ext4_find_extent() directly to update path, avoiding unnecessary memory allocation and freeing. Also, propagate the error returned by ext4_find_extent() instead of using strange error codes. Fixes: 8016e29f4362 ("ext4: fast commit recovery path") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-8-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b431fcfb6ef8..b8dd2fdd90d9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5885,7 +5885,7 @@ out: int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk) { - struct ext4_ext_path *path = NULL, *ppath; + struct ext4_ext_path *path; struct ext4_extent *ex; int ret; @@ -5901,30 +5901,29 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, if (le32_to_cpu(ex->ee_block) != start || ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ - ppath = path; down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1); + ret = ext4_force_split_extent_at(NULL, inode, &path, start, 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; - kfree(path); - path = ext4_find_extent(inode, start, NULL, 0); + + path = ext4_find_extent(inode, start, &path, 0); if (IS_ERR(path)) - return -1; - ppath = path; + return PTR_ERR(path); ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); + if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_force_split_extent_at(NULL, inode, &ppath, + ret = ext4_force_split_extent_at(NULL, inode, &path, start + len, 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; - kfree(path); - path = ext4_find_extent(inode, start, NULL, 0); + + path = ext4_find_extent(inode, start, &path, 0); if (IS_ERR(path)) - return -EINVAL; + return PTR_ERR(path); ex = path[path->p_depth].p_ext; } } From dcaa6c31134c0f515600111c38ed7750003e1b9c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:28 +0800 Subject: [PATCH 67/93] ext4: fix double brelse() the buffer of the extents path In ext4_ext_try_to_merge_up(), set path[1].p_bh to NULL after it has been released, otherwise it may be released twice. An example of what triggers this is as follows: split2 map split1 |--------|-------|--------| ext4_ext_map_blocks ext4_ext_handle_unwritten_extents ext4_split_convert_extents // path->p_depth == 0 ext4_split_extent // 1. do split1 ext4_split_extent_at |ext4_ext_insert_extent | ext4_ext_create_new_leaf | ext4_ext_grow_indepth | le16_add_cpu(&neh->eh_depth, 1) | ext4_find_extent | // return -ENOMEM |// get error and try zeroout |path = ext4_find_extent | path->p_depth = 1 |ext4_ext_try_to_merge | ext4_ext_try_to_merge_up | path->p_depth = 0 | brelse(path[1].p_bh) ---> not set to NULL here |// zeroout success // 2. update path ext4_find_extent // 3. do split2 ext4_split_extent_at ext4_ext_insert_extent ext4_ext_create_new_leaf ext4_ext_grow_indepth le16_add_cpu(&neh->eh_depth, 1) ext4_find_extent path[0].p_bh = NULL; path->p_depth = 1 read_extent_tree_block ---> return err // path[1].p_bh is still the old value ext4_free_ext_path ext4_ext_drop_refs // path->p_depth == 1 brelse(path[1].p_bh) ---> brelse a buffer twice Finally got the following WARRNING when removing the buffer from lru: ============================================ VFS: brelse: Trying to free free buffer WARNING: CPU: 2 PID: 72 at fs/buffer.c:1241 __brelse+0x58/0x90 CPU: 2 PID: 72 Comm: kworker/u19:1 Not tainted 6.9.0-dirty #716 RIP: 0010:__brelse+0x58/0x90 Call Trace: __find_get_block+0x6e7/0x810 bdev_getblk+0x2b/0x480 __ext4_get_inode_loc+0x48a/0x1240 ext4_get_inode_loc+0xb2/0x150 ext4_reserve_inode_write+0xb7/0x230 __ext4_mark_inode_dirty+0x144/0x6a0 ext4_ext_insert_extent+0x9c8/0x3230 ext4_ext_map_blocks+0xf45/0x2dc0 ext4_map_blocks+0x724/0x1700 ext4_do_writepages+0x12d6/0x2a70 [...] ============================================ Fixes: ecb94f5fdf4b ("ext4: collapse a single extent tree block into the inode if possible") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-9-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b8dd2fdd90d9..7536ab837c3f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1890,6 +1890,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, path[0].p_hdr->eh_max = cpu_to_le16(max_root); brelse(path[1].p_bh); + path[1].p_bh = NULL; ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } From 6c2b3246cd0b951338c54b10f8ca13c59a41c86a Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:29 +0800 Subject: [PATCH 68/93] ext4: add new ext4_ext_path_brelse() helper Add ext4_ext_path_brelse() helper function to reduce duplicate code and ensure that path->p_bh is set to NULL after it is released. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-10-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7536ab837c3f..94068f1a0d85 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -106,6 +106,12 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) return 0; } +static inline void ext4_ext_path_brelse(struct ext4_ext_path *path) +{ + brelse(path->p_bh); + path->p_bh = NULL; +} + static void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth, i; @@ -113,10 +119,8 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path) if (!path) return; depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) { - brelse(path->p_bh); - path->p_bh = NULL; - } + for (i = 0; i <= depth; i++, path++) + ext4_ext_path_brelse(path); } void ext4_free_ext_path(struct ext4_ext_path *path) @@ -635,8 +639,7 @@ int ext4_ext_precache(struct inode *inode) */ if ((i == depth) || path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; continue; } @@ -1889,8 +1892,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); path[0].p_hdr->eh_max = cpu_to_le16(max_root); - brelse(path[1].p_bh); - path[1].p_bh = NULL; + ext4_ext_path_brelse(path + 1); ext4_free_blocks(handle, inode, NULL, blk, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } @@ -2959,8 +2961,7 @@ again: err = ext4_ext_rm_leaf(handle, inode, path, &partial, start, end); /* root level has p_bh == NULL, brelse() eats this */ - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; continue; } @@ -3022,8 +3023,7 @@ again: err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ - brelse(path[i].p_bh); - path[i].p_bh = NULL; + ext4_ext_path_brelse(path + i); i--; ext_debug(inode, "return to level %d\n", i); } From 369c944ed1d7c3fb7b35f24e4735761153afe7b3 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:30 +0800 Subject: [PATCH 69/93] ext4: propagate errors from ext4_find_extent() in ext4_insert_range() Even though ext4_find_extent() returns an error, ext4_insert_range() still returns 0. This may confuse the user as to why fallocate returns success, but the contents of the file are not as expected. So propagate the error returned by ext4_find_extent() to avoid inconsistencies. Fixes: 331573febb6a ("ext4: Add support FALLOC_FL_INSERT_RANGE for fallocate") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-11-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 94068f1a0d85..e2f89ca3e138 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5541,6 +5541,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) path = ext4_find_extent(inode, offset_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); + ret = PTR_ERR(path); goto out_stop; } From 0be4c0c2f17bd10ae16c852f02d51a6a7b318aca Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:31 +0800 Subject: [PATCH 70/93] ext4: get rid of ppath in ext4_find_extent() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. Getting rid of ppath in ext4_find_extent() requires its caller to update ppath. These ppaths will also be dropped later. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-12-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 55 +++++++++++++++++++++++-------------------- fs/ext4/move_extent.c | 7 +++--- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7c3438b431e4..93b15b182797 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3714,7 +3714,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path **, struct ext4_extent *, int); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path **, + struct ext4_ext_path *, int flags); extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e2f89ca3e138..c97e7a7e1476 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -884,11 +884,10 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode) struct ext4_ext_path * ext4_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path **orig_path, int flags) + struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; - struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; int ret; gfp_t gfp_flags = GFP_NOFS; @@ -909,7 +908,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_ext_drop_refs(path); if (depth > path[0].p_maxdepth) { kfree(path); - *orig_path = path = NULL; + path = NULL; } } if (!path) { @@ -960,14 +959,10 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_ext_show_path(inode, path); - if (orig_path) - *orig_path = path; return path; err: ext4_free_ext_path(path); - if (orig_path) - *orig_path = NULL; return ERR_PTR(ret); } @@ -1432,7 +1427,7 @@ repeat: /* refill path */ path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - ppath, gb_flags); + path, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { @@ -1444,7 +1439,7 @@ repeat: /* refill path */ path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - ppath, gb_flags); + path, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1460,8 +1455,8 @@ repeat: goto repeat; } } - out: + *ppath = IS_ERR(path) ? NULL : path; return err; } @@ -3263,15 +3258,17 @@ static int ext4_split_extent_at(handle_t *handle, * WARN_ON may be triggered in ext4_da_update_reserve_space() due to * an incorrect ee_len causing the i_reserved_data_blocks exception. */ - path = ext4_find_extent(inode, ee_block, ppath, + path = ext4_find_extent(inode, ee_block, *ppath, flags | EXT4_EX_NOFAIL); if (IS_ERR(path)) { EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", split, PTR_ERR(path)); + *ppath = NULL; return PTR_ERR(path); } depth = ext_depth(inode); ex = path[depth].p_ext; + *ppath = path; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { @@ -3381,9 +3378,12 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ - path = ext4_find_extent(inode, map->m_lblk, ppath, flags); - if (IS_ERR(path)) + path = ext4_find_extent(inode, map->m_lblk, *ppath, flags); + if (IS_ERR(path)) { + *ppath = NULL; return PTR_ERR(path); + } + *ppath = path; depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { @@ -3769,9 +3769,12 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, EXT4_GET_BLOCKS_CONVERT); if (err < 0) return err; - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); - if (IS_ERR(path)) + path = ext4_find_extent(inode, map->m_lblk, *ppath, 0); + if (IS_ERR(path)) { + *ppath = NULL; return PTR_ERR(path); + } + *ppath = path; depth = ext_depth(inode); ex = path[depth].p_ext; } @@ -3827,9 +3830,12 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); if (err < 0) return err; - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); - if (IS_ERR(path)) + path = ext4_find_extent(inode, map->m_lblk, *ppath, 0); + if (IS_ERR(path)) { + *ppath = NULL; return PTR_ERR(path); + } + *ppath = path; depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { @@ -5190,7 +5196,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * won't be shifted beyond EXT_MAX_BLOCKS. */ if (SHIFT == SHIFT_LEFT) { - path = ext4_find_extent(inode, start - 1, &path, + path = ext4_find_extent(inode, start - 1, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5239,7 +5245,7 @@ again: * becomes NULL to indicate the end of the loop. */ while (iterator && start <= stop) { - path = ext4_find_extent(inode, *iterator, &path, + path = ext4_find_extent(inode, *iterator, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); @@ -5821,11 +5827,8 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu) /* search for the extent closest to the first block in the cluster */ path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0); - if (IS_ERR(path)) { - err = PTR_ERR(path); - path = NULL; - goto out; - } + if (IS_ERR(path)) + return PTR_ERR(path); depth = ext_depth(inode); @@ -5909,7 +5912,7 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, if (ret) goto out; - path = ext4_find_extent(inode, start, &path, 0); + path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; @@ -5923,7 +5926,7 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, if (ret) goto out; - path = ext4_find_extent(inode, start, &path, 0); + path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 228b56f7d047..d3e61f1be046 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -26,16 +26,17 @@ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, struct ext4_ext_path **ppath) { - struct ext4_ext_path *path; + struct ext4_ext_path *path = *ppath; - path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); + *ppath = NULL; + path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); if (path[ext_depth(inode)].p_ext == NULL) { ext4_free_ext_path(path); - *ppath = NULL; return -ENODATA; } + *ppath = path; return 0; } From 6b854d552711aa33f59eda334e6d94a00d8825bb Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:32 +0800 Subject: [PATCH 71/93] ext4: get rid of ppath in get_ext_path() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. After getting rid of ppath in get_ext_path(), its caller may pass an error pointer to ext4_free_ext_path(), so it needs to teach ext4_free_ext_path() and ext4_ext_drop_refs() to skip the error pointer. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-13-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 5 +++-- fs/ext4/move_extent.c | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c97e7a7e1476..575e674219a8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -116,7 +116,7 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path) { int depth, i; - if (!path) + if (IS_ERR_OR_NULL(path)) return; depth = path->p_depth; for (i = 0; i <= depth; i++, path++) @@ -125,6 +125,8 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path) void ext4_free_ext_path(struct ext4_ext_path *path) { + if (IS_ERR_OR_NULL(path)) + return; ext4_ext_drop_refs(path); kfree(path); } @@ -4193,7 +4195,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); - path = NULL; goto out; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d3e61f1be046..b64661ea6e0e 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -17,27 +17,23 @@ * get_ext_path() - Find an extent path for designated logical block number. * @inode: inode to be searched * @lblock: logical block number to find an extent path - * @ppath: pointer to an extent path pointer (for output) + * @path: pointer to an extent path * - * ext4_find_extent wrapper. Return 0 on success, or a negative error value - * on failure. + * ext4_find_extent wrapper. Return an extent path pointer on success, + * or an error pointer on failure. */ -static inline int +static inline struct ext4_ext_path * get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **ppath) + struct ext4_ext_path *path) { - struct ext4_ext_path *path = *ppath; - - *ppath = NULL; path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) - return PTR_ERR(path); + return path; if (path[ext_depth(inode)].p_ext == NULL) { ext4_free_ext_path(path); - return -ENODATA; + return ERR_PTR(-ENODATA); } - *ppath = path; - return 0; + return path; } /** @@ -95,9 +91,11 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, int ret = 0; ext4_lblk_t last = from + count; while (from < last) { - *err = get_ext_path(inode, from, &path); - if (*err) - goto out; + path = get_ext_path(inode, from, path); + if (IS_ERR(path)) { + *err = PTR_ERR(path); + return ret; + } ext = path[ext_depth(inode)].p_ext; if (unwritten != ext4_ext_is_unwritten(ext)) goto out; @@ -636,9 +634,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, int offset_in_page; int unwritten, cur_len; - ret = get_ext_path(orig_inode, o_start, &path); - if (ret) + path = get_ext_path(orig_inode, o_start, path); + if (IS_ERR(path)) { + ret = PTR_ERR(path); goto out; + } ex = path[path->p_depth].p_ext; cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); From a000bc8678cc2bb10a5b80b4e991e77c7b4612fd Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:33 +0800 Subject: [PATCH 72/93] ext4: get rid of ppath in ext4_ext_create_new_leaf() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_ext_create_new_leaf(), the following is done here: * Free the extents path when an error is encountered. * Its caller needs to update ppath if it uses ppath. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-14-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 575e674219a8..8610a22f4f5a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1397,13 +1397,12 @@ out: * finds empty index and adds new leaf. * if no free index is found, then it requests in-depth growing. */ -static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - unsigned int mb_flags, - unsigned int gb_flags, - struct ext4_ext_path **ppath, - struct ext4_extent *newext) +static struct ext4_ext_path * +ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, + unsigned int mb_flags, unsigned int gb_flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) { - struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1424,28 +1423,25 @@ repeat: * entry: create all needed subtree and add new leaf */ err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); if (err) - goto out; + goto errout; /* refill path */ path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), path, gb_flags); - if (IS_ERR(path)) - err = PTR_ERR(path); + return path; } else { /* tree is full, time to grow in depth */ err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) - goto out; + goto errout; /* refill path */ path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), path, gb_flags); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } + if (IS_ERR(path)) + return path; /* * only first (depth 0 -> 1) produces free space; @@ -1457,9 +1453,11 @@ repeat: goto repeat; } } -out: - *ppath = IS_ERR(path) ? NULL : path; - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -2112,11 +2110,14 @@ prepend: */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags |= EXT4_MB_USE_RESERVED; - err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, - ppath, newext); - if (err) + path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); + if (IS_ERR(path)) { + *ppath = NULL; + err = PTR_ERR(path); goto cleanup; - path = *ppath; + } + *ppath = path; depth = ext_depth(inode); eh = path[depth].p_hdr; From f7d1331f16a869c76a5102caebb58e840e1d509c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:34 +0800 Subject: [PATCH 73/93] ext4: get rid of ppath in ext4_ext_insert_extent() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_ext_insert_extent(), the following is done here: * Free the extents path when an error is encountered. * Its caller needs to update ppath if it uses ppath. * Free path when npath is used, free npath when it is not used. * The got_allocated_blocks label in ext4_ext_map_blocks() does not update err now, so err is updated to 0 if the err returned by ext4_ext_search_right() is greater than 0 and is about to enter got_allocated_blocks. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-15-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 7 ++-- fs/ext4/extents.c | 88 ++++++++++++++++++++++++------------------- fs/ext4/fast_commit.c | 8 ++-- fs/ext4/migrate.c | 5 ++- 4 files changed, 61 insertions(+), 47 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 93b15b182797..481ece3660eb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3710,9 +3710,10 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode, extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, - struct ext4_ext_path **, - struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_insert_extent( + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *, int flags); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8610a22f4f5a..632f0a1a00b3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1975,16 +1975,15 @@ out: * inserts requested extent as new one into the tree, * creating new leaf in the no-space case. */ -int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path **ppath, - struct ext4_extent *newext, int gb_flags) +struct ext4_ext_path * +ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int gb_flags) { - struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ - struct ext4_ext_path *npath = NULL; - int depth, len, err; + int depth, len, err = 0; ext4_lblk_t next; int mb_flags = 0, unwritten; @@ -1992,14 +1991,16 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } /* try to insert block into found extent and return */ @@ -2037,7 +2038,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); @@ -2062,7 +2063,7 @@ prepend: err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; unwritten = ext4_ext_is_unwritten(ex); ex->ee_block = newext->ee_block; @@ -2087,21 +2088,26 @@ prepend: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { + struct ext4_ext_path *npath; + ext_debug(inode, "next leaf block - %u\n", next); - BUG_ON(npath != NULL); npath = ext4_find_extent(inode, next, NULL, gb_flags); - if (IS_ERR(npath)) - return PTR_ERR(npath); + if (IS_ERR(npath)) { + err = PTR_ERR(npath); + goto errout; + } BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); + ext4_free_ext_path(path); path = npath; goto has_space; } ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); + ext4_free_ext_path(npath); } /* @@ -2112,12 +2118,8 @@ prepend: mb_flags |= EXT4_MB_USE_RESERVED; path = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, path, newext); - if (IS_ERR(path)) { - *ppath = NULL; - err = PTR_ERR(path); - goto cleanup; - } - *ppath = path; + if (IS_ERR(path)) + return path; depth = ext_depth(inode); eh = path[depth].p_hdr; @@ -2126,7 +2128,7 @@ has_space: err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto cleanup; + goto errout; if (!nearex) { /* there is no extent in this leaf, create first one */ @@ -2184,17 +2186,20 @@ merge: if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) ext4_ext_try_to_merge(handle, inode, path, nearex); - /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) - goto cleanup; + goto errout; err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto errout; -cleanup: - ext4_free_ext_path(npath); - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } static int ext4_fill_es_cache_info(struct inode *inode, @@ -3249,24 +3254,29 @@ static int ext4_split_extent_at(handle_t *handle, if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); - err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); - if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (!IS_ERR(path)) { + *ppath = path; goto out; + } + *ppath = NULL; + err = PTR_ERR(path); + if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + return err; /* - * Update path is required because previous ext4_ext_insert_extent() - * may have freed or reallocated the path. Using EXT4_EX_NOFAIL - * guarantees that ext4_find_extent() will not return -ENOMEM, - * otherwise -ENOMEM will cause a retry in do_writepages(), and a - * WARN_ON may be triggered in ext4_da_update_reserve_space() due to - * an incorrect ee_len causing the i_reserved_data_blocks exception. + * Get a new path to try to zeroout or fix the extent length. + * Using EXT4_EX_NOFAIL guarantees that ext4_find_extent() + * will not return -ENOMEM, otherwise -ENOMEM will cause a + * retry in do_writepages(), and a WARN_ON may be triggered + * in ext4_da_update_reserve_space() due to an incorrect + * ee_len causing the i_reserved_data_blocks exception. */ - path = ext4_find_extent(inode, ee_block, *ppath, + path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL); if (IS_ERR(path)) { EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", split, PTR_ERR(path)); - *ppath = NULL; return PTR_ERR(path); } depth = ext_depth(inode); @@ -3325,7 +3335,7 @@ fix_extent_len: ext4_ext_dirty(handle, inode, path + path->p_depth); return err; out: - ext4_ext_show_leaf(inode, *ppath); + ext4_ext_show_leaf(inode, path); return err; } @@ -4315,6 +4325,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; + err = 0; goto got_allocated_blocks; } @@ -4387,8 +4398,9 @@ got_allocated_blocks: map->m_flags |= EXT4_MAP_UNWRITTEN; } - err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); - if (err) { + path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (IS_ERR(path)) { + err = PTR_ERR(path); if (allocated_clusters) { int fb_flags = 0; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 326c16a4e51e..afcb2e5cdac7 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1803,12 +1803,12 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_ext_insert_extent( - NULL, inode, &path, &newex, 0); + path = ext4_ext_insert_extent(NULL, inode, + path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); - ext4_free_ext_path(path); - if (ret) + if (IS_ERR(path)) goto out; + ext4_free_ext_path(path); goto next; } diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d98ac2af8199..0f68b8a14560 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -37,7 +37,6 @@ static int finish_range(handle_t *handle, struct inode *inode, path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); - path = NULL; goto err_out; } @@ -53,7 +52,9 @@ static int finish_range(handle_t *handle, struct inode *inode, retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); if (retval < 0) goto err_out; - retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); + path = ext4_ext_insert_extent(handle, inode, path, &newext, 0); + if (IS_ERR(path)) + retval = PTR_ERR(path); err_out: up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); From 1de82b1b60d4613753254bf3cbf622a4c02c945c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:35 +0800 Subject: [PATCH 74/93] ext4: get rid of ppath in ext4_split_extent_at() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_split_extent_at(), the following is done here: * Free the extents path when an error is encountered. * Its caller needs to update ppath if it uses ppath. * Teach ext4_ext_show_leaf() to skip error pointer. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-16-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 85 ++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 632f0a1a00b3..fcc06fd77af9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -84,12 +84,11 @@ static void ext4_extent_block_csum_set(struct inode *inode, et->et_checksum = ext4_extent_block_csum(inode, eh); } -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - ext4_lblk_t split, - int split_flag, - int flags); +static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, int flags); static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { @@ -341,9 +340,15 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; - return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? + path = ext4_split_extent_at(handle, inode, path, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, flags); + if (IS_ERR(path)) { + *ppath = NULL; + return PTR_ERR(path); + } + *ppath = path; + return 0; } static int @@ -694,7 +699,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) struct ext4_extent *ex; int i; - if (!path) + if (IS_ERR_OR_NULL(path)) return; eh = path[depth].p_hdr; @@ -3175,16 +3180,14 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) * a> the extent are splitted into two extent. * b> split is not needed, and just mark the extent. * - * return 0 on success. + * Return an extent path pointer on success, or an error pointer on failure. */ -static int ext4_split_extent_at(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - ext4_lblk_t split, - int split_flag, - int flags) +static struct ext4_ext_path *ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, int flags) { - struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; @@ -3255,14 +3258,12 @@ static int ext4_split_extent_at(handle_t *handle, ext4_ext_mark_unwritten(ex2); path = ext4_ext_insert_extent(handle, inode, path, &newex, flags); - if (!IS_ERR(path)) { - *ppath = path; + if (!IS_ERR(path)) goto out; - } - *ppath = NULL; + err = PTR_ERR(path); if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) - return err; + return path; /* * Get a new path to try to zeroout or fix the extent length. @@ -3272,16 +3273,14 @@ static int ext4_split_extent_at(handle_t *handle, * in ext4_da_update_reserve_space() due to an incorrect * ee_len causing the i_reserved_data_blocks exception. */ - path = ext4_find_extent(inode, ee_block, NULL, - flags | EXT4_EX_NOFAIL); + path = ext4_find_extent(inode, ee_block, NULL, flags | EXT4_EX_NOFAIL); if (IS_ERR(path)) { EXT4_ERROR_INODE(inode, "Failed split extent on %u, err %ld", split, PTR_ERR(path)); - return PTR_ERR(path); + return path; } depth = ext_depth(inode); ex = path[depth].p_ext; - *ppath = path; if (EXT4_EXT_MAY_ZEROOUT & split_flag) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { @@ -3333,10 +3332,13 @@ fix_extent_len: * and err is a non-zero error code. */ ext4_ext_dirty(handle, inode, path + path->p_depth); - return err; out: + if (err) { + ext4_free_ext_path(path); + path = ERR_PTR(err); + } ext4_ext_show_leaf(inode, path); - return err; + return path; } /* @@ -3380,10 +3382,14 @@ static int ext4_split_extent(handle_t *handle, EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; - err = ext4_split_extent_at(handle, inode, ppath, + path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); - if (err) + if (IS_ERR(path)) { + err = PTR_ERR(path); + *ppath = NULL; goto out; + } + *ppath = path; } else { allocated = ee_len - (map->m_lblk - ee_block); } @@ -3391,7 +3397,7 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ - path = ext4_find_extent(inode, map->m_lblk, *ppath, flags); + path = ext4_find_extent(inode, map->m_lblk, path, flags); if (IS_ERR(path)) { *ppath = NULL; return PTR_ERR(path); @@ -3413,13 +3419,17 @@ static int ext4_split_extent(handle_t *handle, split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } - err = ext4_split_extent_at(handle, inode, ppath, + path = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); - if (err) + if (IS_ERR(path)) { + err = PTR_ERR(path); + *ppath = NULL; goto out; + } + *ppath = path; } - ext4_ext_show_leaf(inode, *ppath); + ext4_ext_show_leaf(inode, path); out: return err ? err : allocated; } @@ -5580,22 +5590,21 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; - ret = ext4_split_extent_at(handle, inode, &path, + path = ext4_split_extent_at(handle, inode, path, offset_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_METADATA_NOFAIL); } - ext4_free_ext_path(path); - if (ret < 0) { + if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); + ret = PTR_ERR(path); goto out_stop; } - } else { - ext4_free_ext_path(path); } + ext4_free_ext_path(path); ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); /* From f07be1c367369636d7d338d7994473d6eae283c5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:36 +0800 Subject: [PATCH 75/93] ext4: get rid of ppath in ext4_force_split_extent_at() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_force_split_extent_at(), the following is done here: * Free the extents path when an error is encountered. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-17-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 69 ++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index fcc06fd77af9..81bb9a67ec8e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -328,27 +328,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) return size; } -static inline int +static inline struct ext4_ext_path * ext4_force_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path **ppath, ext4_lblk_t lblk, + struct ext4_ext_path *path, ext4_lblk_t lblk, int nofail) { - struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; if (nofail) flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; - path = ext4_split_extent_at(handle, inode, path, lblk, unwritten ? + return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, flags); - if (IS_ERR(path)) { - *ppath = NULL; - return PTR_ERR(path); - } - *ppath = path; - return 0; } static int @@ -2907,11 +2900,12 @@ again: * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ - err = ext4_force_split_extent_at(handle, inode, &path, - end + 1, 1); - if (err < 0) + path = ext4_force_split_extent_at(handle, inode, path, + end + 1, 1); + if (IS_ERR(path)) { + err = PTR_ERR(path); goto out; - + } } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && partial.state == initial) { /* @@ -5728,17 +5722,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, /* Prepare left boundary */ if (e1_blk < lblk1) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode1, - &path1, lblk1, 0); - if (unlikely(*erp)) + path1 = ext4_force_split_extent_at(handle, inode1, + path1, lblk1, 0); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); goto finish; + } } if (e2_blk < lblk2) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode2, - &path2, lblk2, 0); - if (unlikely(*erp)) + path2 = ext4_force_split_extent_at(handle, inode2, + path2, lblk2, 0); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); goto finish; + } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ @@ -5754,17 +5752,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, if (len != e1_len) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode1, - &path1, lblk1 + len, 0); - if (unlikely(*erp)) + path1 = ext4_force_split_extent_at(handle, inode1, + path1, lblk1 + len, 0); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); goto finish; + } } if (len != e2_len) { split = 1; - *erp = ext4_force_split_extent_at(handle, inode2, - &path2, lblk2 + len, 0); - if (*erp) + path2 = ext4_force_split_extent_at(handle, inode2, + path2, lblk2 + len, 0); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); goto finish; + } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ @@ -5930,24 +5932,29 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, ext4_ext_get_actual_len(ex) != len) { /* We need to split this extent to match our extent first */ down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_force_split_extent_at(NULL, inode, &path, start, 1); + path = ext4_force_split_extent_at(NULL, inode, path, start, 1); up_write(&EXT4_I(inode)->i_data_sem); - if (ret) + if (IS_ERR(path)) { + ret = PTR_ERR(path); goto out; + } path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) return PTR_ERR(path); + ex = path[path->p_depth].p_ext; WARN_ON(le32_to_cpu(ex->ee_block) != start); if (ext4_ext_get_actual_len(ex) != len) { down_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_force_split_extent_at(NULL, inode, &path, - start + len, 1); + path = ext4_force_split_extent_at(NULL, inode, path, + start + len, 1); up_write(&EXT4_I(inode)->i_data_sem); - if (ret) + if (IS_ERR(path)) { + ret = PTR_ERR(path); goto out; + } path = ext4_find_extent(inode, start, path, 0); if (IS_ERR(path)) From f74cde045617cc275c848c9692feac249ff7a3e7 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:37 +0800 Subject: [PATCH 76/93] ext4: get rid of ppath in ext4_split_extent() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_split_extent(), the following is done here: * The 'allocated' is changed from passing a value to passing an address. * Its caller needs to update ppath if it uses ppath. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-18-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 97 ++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 81bb9a67ec8e..c393fcfd038e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3346,21 +3346,18 @@ out: * c> Splits in three extents: Somone is splitting in middle of the extent * */ -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - struct ext4_map_blocks *map, - int split_flag, - int flags) +static struct ext4_ext_path *ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, int flags, + unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; - int err = 0; int unwritten; int split_flag1, flags1; - int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3378,33 +3375,25 @@ static int ext4_split_extent(handle_t *handle, split_flag1 |= EXT4_EXT_DATA_VALID1; path = ext4_split_extent_at(handle, inode, path, map->m_lblk + map->m_len, split_flag1, flags1); - if (IS_ERR(path)) { - err = PTR_ERR(path); - *ppath = NULL; - goto out; + if (IS_ERR(path)) + return path; + /* + * Update path is required because previous ext4_split_extent_at + * may result in split of original leaf or extent zeroout. + */ + path = ext4_find_extent(inode, map->m_lblk, path, flags); + if (IS_ERR(path)) + return path; + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + ext4_free_ext_path(path); + return ERR_PTR(-EFSCORRUPTED); } - *ppath = path; - } else { - allocated = ee_len - (map->m_lblk - ee_block); + unwritten = ext4_ext_is_unwritten(ex); } - /* - * Update path is required because previous ext4_split_extent_at() may - * result in split of original leaf or extent zeroout. - */ - path = ext4_find_extent(inode, map->m_lblk, path, flags); - if (IS_ERR(path)) { - *ppath = NULL; - return PTR_ERR(path); - } - *ppath = path; - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) { - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) map->m_lblk); - return -EFSCORRUPTED; - } - unwritten = ext4_ext_is_unwritten(ex); if (map->m_lblk >= ee_block) { split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; @@ -3415,17 +3404,18 @@ static int ext4_split_extent(handle_t *handle, } path = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); - if (IS_ERR(path)) { - err = PTR_ERR(path); - *ppath = NULL; - goto out; - } - *ppath = path; + if (IS_ERR(path)) + return path; } + if (allocated) { + if (map->m_lblk + map->m_len > ee_block + ee_len) + *allocated = ee_len - (map->m_lblk - ee_block); + else + *allocated = map->m_len; + } ext4_ext_show_leaf(inode, path); -out: - return err ? err : allocated; + return path; } /* @@ -3670,10 +3660,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } fallback: - err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, - flags); - if (err > 0) - err = 0; + path = ext4_split_extent(handle, inode, path, &split_map, split_flag, + flags, NULL); + if (IS_ERR(path)) { + err = PTR_ERR(path); + *ppath = NULL; + goto out; + } + err = 0; + *ppath = path; out: /* If we have gotten a failure, don't zero out status tree */ if (!err) { @@ -3719,6 +3714,7 @@ static int ext4_split_convert_extents(handle_t *handle, struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth; + unsigned int allocated = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); @@ -3746,7 +3742,14 @@ static int ext4_split_convert_extents(handle_t *handle, split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); + path = ext4_split_extent(handle, inode, path, map, split_flag, flags, + &allocated); + if (IS_ERR(path)) { + *ppath = NULL; + return PTR_ERR(path); + } + *ppath = path; + return allocated; } static int ext4_convert_unwritten_extents_endio(handle_t *handle, From 225057b1af381567ffa4eb813f4a28a5c38a25cf Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:38 +0800 Subject: [PATCH 77/93] ext4: get rid of ppath in ext4_split_convert_extents() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_split_convert_extents(), the following is done here: * Its caller needs to update ppath if it uses ppath. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-19-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 65 ++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c393fcfd038e..6df5140d58ff 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3700,21 +3700,21 @@ out: * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). * - * Returns the size of unwritten extent to be written on success. + * The size of unwritten extent to be written is passed to the caller via the + * allocated pointer. Return an extent path pointer on success, or an error + * pointer on failure. */ -static int ext4_split_convert_extents(handle_t *handle, +static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, - int flags) + struct ext4_ext_path *path, + int flags, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len; int split_flag = 0, depth; - unsigned int allocated = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); @@ -3742,14 +3742,8 @@ static int ext4_split_convert_extents(handle_t *handle, split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; - path = ext4_split_extent(handle, inode, path, map, split_flag, flags, - &allocated); - if (IS_ERR(path)) { - *ppath = NULL; - return PTR_ERR(path); - } - *ppath = path; - return allocated; + return ext4_split_extent(handle, inode, path, map, split_flag, flags, + allocated); } static int ext4_convert_unwritten_extents_endio(handle_t *handle, @@ -3785,11 +3779,14 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_convert_extents(handle, inode, map, ppath, - EXT4_GET_BLOCKS_CONVERT); - if (err < 0) - return err; - path = ext4_find_extent(inode, map->m_lblk, *ppath, 0); + path = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT, NULL); + if (IS_ERR(path)) { + *ppath = NULL; + return PTR_ERR(path); + } + + path = ext4_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) { *ppath = NULL; return PTR_ERR(path); @@ -3846,11 +3843,14 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { - err = ext4_split_convert_extents(handle, inode, map, ppath, - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); - if (err < 0) - return err; - path = ext4_find_extent(inode, map->m_lblk, *ppath, 0); + path = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL); + if (IS_ERR(path)) { + *ppath = NULL; + return PTR_ERR(path); + } + + path = ext4_find_extent(inode, map->m_lblk, path, 0); if (IS_ERR(path)) { *ppath = NULL; return PTR_ERR(path); @@ -3916,19 +3916,20 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { - ret = ext4_split_convert_extents(handle, inode, map, ppath, - flags | EXT4_GET_BLOCKS_CONVERT); - if (ret < 0) { - err = ret; + *ppath = ext4_split_convert_extents(handle, inode, map, *ppath, + flags | EXT4_GET_BLOCKS_CONVERT, &allocated); + if (IS_ERR(*ppath)) { + err = PTR_ERR(*ppath); + *ppath = NULL; goto out2; } /* - * shouldn't get a 0 return when splitting an extent unless + * shouldn't get a 0 allocated when splitting an extent unless * m_len is 0 (bug) or extent has been corrupted */ - if (unlikely(ret == 0)) { + if (unlikely(allocated == 0)) { EXT4_ERROR_INODE(inode, - "unexpected ret == 0, m_len = %u", + "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto out2; @@ -3989,9 +3990,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, err = -EFSCORRUPTED; goto out2; } + allocated = ret; out: - allocated = ret; map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; From 8d5ad7b08f9234bc92b9567cfe52e521df5f6626 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:39 +0800 Subject: [PATCH 78/93] ext4: get rid of ppath in ext4_convert_unwritten_extents_endio() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_convert_unwritten_extents_endio(), the following is done here: * Free the extents path when an error is encountered. * Its caller needs to update ppath if it uses ppath. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-20-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6df5140d58ff..f1ea59cd9a16 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3746,12 +3746,11 @@ static struct ext4_ext_path *ext4_split_convert_extents(handle_t *handle, allocated); } -static int ext4_convert_unwritten_extents_endio(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path **ppath) +static struct ext4_ext_path * +ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) { - struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3781,24 +3780,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, #endif path = ext4_split_convert_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT, NULL); - if (IS_ERR(path)) { - *ppath = NULL; - return PTR_ERR(path); - } + if (IS_ERR(path)) + return path; path = ext4_find_extent(inode, map->m_lblk, path, 0); - if (IS_ERR(path)) { - *ppath = NULL; - return PTR_ERR(path); - } - *ppath = path; + if (IS_ERR(path)) + return path; depth = ext_depth(inode); ex = path[depth].p_ext; } err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; /* first mark the extent as initialized */ ext4_ext_mark_initialized(ex); @@ -3809,9 +3803,15 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); -out: + if (err) + goto errout; + ext4_ext_show_leaf(inode, path); - return err; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } static int @@ -3939,10 +3939,13 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - err = ext4_convert_unwritten_extents_endio(handle, inode, map, - ppath); - if (err < 0) + *ppath = ext4_convert_unwritten_extents_endio(handle, inode, + map, *ppath); + if (IS_ERR(*ppath)) { + err = PTR_ERR(*ppath); + *ppath = NULL; goto out2; + } ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } From 33c14b8bd8a9ef8b3dfde136b0ca779e68c2f576 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:40 +0800 Subject: [PATCH 79/93] ext4: get rid of ppath in ext4_ext_convert_to_initialized() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_ext_convert_to_initialized(), the following is done here: * Free the extents path when an error is encountered. * Its caller needs to update ppath if it uses ppath. * The 'allocated' is changed from passing a value to passing an address. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-21-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 73 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f1ea59cd9a16..ac480d76444c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3438,13 +3438,11 @@ static struct ext4_ext_path *ext4_split_extent(handle_t *handle, * that are allocated and initialized. * It is guaranteed to be >= map->m_len. */ -static int ext4_ext_convert_to_initialized(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, - int flags) +static struct ext4_ext_path * +ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, struct ext4_ext_path *path, + int flags, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; @@ -3454,7 +3452,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, unsigned int ee_len, depth, map_len = map->m_len; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; - int allocated = 0; unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", @@ -3495,6 +3492,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, * - L2: we only attempt to merge with an extent stored in the * same extent tree node. */ + *allocated = 0; if ((map->m_lblk == ee_block) && /* See if we can merge left */ (map_len < ee_len) && /*L1*/ @@ -3524,7 +3522,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); @@ -3539,7 +3537,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_len = cpu_to_le16(prev_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = map_len; + *allocated = map_len; } } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && (map_len < ee_len) && /*L1*/ @@ -3570,7 +3568,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) - goto out; + goto errout; trace_ext4_ext_convert_to_initialized_fastpath(inode, map, ex, abut_ex); @@ -3585,18 +3583,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, abut_ex->ee_len = cpu_to_le16(next_len + map_len); /* Result: number of initialized blocks past m_lblk */ - allocated = map_len; + *allocated = map_len; } } - if (allocated) { + if (*allocated) { /* Mark the block containing both extents as dirty */ err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; + if (err) + goto errout; goto out; } else - allocated = ee_len - (map->m_lblk - ee_block); + *allocated = ee_len - (map->m_lblk - ee_block); WARN_ON(map->m_lblk < ee_block); /* @@ -3623,21 +3623,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_lblk = map->m_lblk; split_map.m_len = map->m_len; - if (max_zeroout && (allocated > split_map.m_len)) { - if (allocated <= max_zeroout) { + if (max_zeroout && (*allocated > split_map.m_len)) { + if (*allocated <= max_zeroout) { /* case 3 or 5 */ zero_ex1.ee_block = cpu_to_le32(split_map.m_lblk + split_map.m_len); zero_ex1.ee_len = - cpu_to_le16(allocated - split_map.m_len); + cpu_to_le16(*allocated - split_map.m_len); ext4_ext_store_pblock(&zero_ex1, ext4_ext_pblock(ex) + split_map.m_lblk + split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) goto fallback; - split_map.m_len = allocated; + split_map.m_len = *allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < max_zeroout) { @@ -3655,27 +3655,24 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_len += split_map.m_lblk - ee_block; split_map.m_lblk = ee_block; - allocated = map->m_len; + *allocated = map->m_len; } } fallback: path = ext4_split_extent(handle, inode, path, &split_map, split_flag, flags, NULL); - if (IS_ERR(path)) { - err = PTR_ERR(path); - *ppath = NULL; - goto out; - } - err = 0; - *ppath = path; + if (IS_ERR(path)) + return path; out: /* If we have gotten a failure, don't zero out status tree */ - if (!err) { - ext4_zeroout_es(inode, &zero_ex1); - ext4_zeroout_es(inode, &zero_ex2); - } - return err ? err : allocated; + ext4_zeroout_es(inode, &zero_ex1); + ext4_zeroout_es(inode, &zero_ex2); + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -3897,7 +3894,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { - int ret = 0; int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", @@ -3977,23 +3973,24 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, * For buffered writes, at writepage time, etc. Convert a * discovered unwritten extent to written. */ - ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); - if (ret < 0) { - err = ret; + *ppath = ext4_ext_convert_to_initialized(handle, inode, map, *ppath, + flags, &allocated); + if (IS_ERR(*ppath)) { + err = PTR_ERR(*ppath); + *ppath = NULL; goto out2; } ext4_update_inode_fsync_trans(handle, inode, 1); /* - * shouldn't get a 0 return when converting an unwritten extent + * shouldn't get a 0 allocated when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ - if (unlikely(ret == 0)) { - EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", + if (unlikely(allocated == 0)) { + EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; goto out2; } - allocated = ret; out: map->m_flags |= EXT4_MAP_NEW; From 2ec2e1043473b3d4a3afbe6ad7c5a5b7a6fdf480 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:41 +0800 Subject: [PATCH 80/93] ext4: get rid of ppath in ext4_ext_handle_unwritten_extents() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in ext4_ext_handle_unwritten_extents(), the following is done here: * Free the extents path when an error is encountered. * The 'allocated' is changed from passing a value to passing an address. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-22-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 82 +++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 45 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ac480d76444c..233afd884abc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3888,18 +3888,18 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, return 0; } -static int +static struct ext4_ext_path * ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, int flags, - unsigned int allocated, ext4_fsblk_t newblock) + struct ext4_ext_path *path, int flags, + unsigned int *allocated, ext4_fsblk_t newblock) { int err = 0; ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", (unsigned long long)map->m_lblk, map->m_len, flags, - allocated); - ext4_ext_show_leaf(inode, *ppath); + *allocated); + ext4_ext_show_leaf(inode, path); /* * When writing into unwritten space, we should not fail to @@ -3908,40 +3908,34 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; trace_ext4_ext_handle_unwritten_extents(inode, map, flags, - allocated, newblock); + *allocated, newblock); /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { - *ppath = ext4_split_convert_extents(handle, inode, map, *ppath, - flags | EXT4_GET_BLOCKS_CONVERT, &allocated); - if (IS_ERR(*ppath)) { - err = PTR_ERR(*ppath); - *ppath = NULL; - goto out2; - } + path = ext4_split_convert_extents(handle, inode, map, path, + flags | EXT4_GET_BLOCKS_CONVERT, allocated); + if (IS_ERR(path)) + return path; /* * shouldn't get a 0 allocated when splitting an extent unless * m_len is 0 (bug) or extent has been corrupted */ - if (unlikely(allocated == 0)) { + if (unlikely(*allocated == 0)) { EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; - goto out2; + goto errout; } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - *ppath = ext4_convert_unwritten_extents_endio(handle, inode, - map, *ppath); - if (IS_ERR(*ppath)) { - err = PTR_ERR(*ppath); - *ppath = NULL; - goto out2; - } + path = ext4_convert_unwritten_extents_endio(handle, inode, + map, path); + if (IS_ERR(path)) + return path; ext4_update_inode_fsync_trans(handle, inode, 1); goto map_out; } @@ -3973,23 +3967,20 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, * For buffered writes, at writepage time, etc. Convert a * discovered unwritten extent to written. */ - *ppath = ext4_ext_convert_to_initialized(handle, inode, map, *ppath, - flags, &allocated); - if (IS_ERR(*ppath)) { - err = PTR_ERR(*ppath); - *ppath = NULL; - goto out2; - } + path = ext4_ext_convert_to_initialized(handle, inode, map, path, + flags, allocated); + if (IS_ERR(path)) + return path; ext4_update_inode_fsync_trans(handle, inode, 1); /* * shouldn't get a 0 allocated when converting an unwritten extent * unless m_len is 0 (bug) or extent has been corrupted */ - if (unlikely(allocated == 0)) { + if (unlikely(*allocated == 0)) { EXT4_ERROR_INODE(inode, "unexpected allocated == 0, m_len = %u", map->m_len); err = -EFSCORRUPTED; - goto out2; + goto errout; } out: @@ -3998,12 +3989,15 @@ map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: map->m_pblk = newblock; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - ext4_ext_show_leaf(inode, *ppath); -out2: - return err ? err : allocated; + if (*allocated > map->m_len) + *allocated = map->m_len; + map->m_len = *allocated; + ext4_ext_show_leaf(inode, path); + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } /* @@ -4201,7 +4195,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; - int err = 0, depth, ret; + int err = 0, depth; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4275,13 +4269,11 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out; } - ret = ext4_ext_handle_unwritten_extents( - handle, inode, map, &path, flags, - allocated, newblock); - if (ret < 0) - err = ret; - else - allocated = ret; + path = ext4_ext_handle_unwritten_extents( + handle, inode, map, path, flags, + &allocated, newblock); + if (IS_ERR(path)) + err = PTR_ERR(path); goto out; } } From 4191eefef978d734fa8249bede3f9b02a85aa3c0 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:42 +0800 Subject: [PATCH 81/93] ext4: get rid of ppath in convert_initialized_extent() The use of path and ppath is now very confusing, so to make the code more readable, pass path between functions uniformly, and get rid of ppath. To get rid of the ppath in convert_initialized_extent(), the following is done here: * Free the extents path when an error is encountered. No functional changes. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-23-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 233afd884abc..dbadf2aa4376 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3811,13 +3811,12 @@ errout: return ERR_PTR(err); } -static int +static struct ext4_ext_path * convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path **ppath, + struct ext4_ext_path *path, unsigned int *allocated) { - struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3842,29 +3841,25 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (ee_block != map->m_lblk || ee_len > map->m_len) { path = ext4_split_convert_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL); - if (IS_ERR(path)) { - *ppath = NULL; - return PTR_ERR(path); - } + if (IS_ERR(path)) + return path; path = ext4_find_extent(inode, map->m_lblk, path, 0); - if (IS_ERR(path)) { - *ppath = NULL; - return PTR_ERR(path); - } - *ppath = path; + if (IS_ERR(path)) + return path; depth = ext_depth(inode); ex = path[depth].p_ext; if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto errout; } } err = ext4_ext_get_access(handle, inode, path + depth); if (err) - return err; + goto errout; /* first mark the extent as unwritten */ ext4_ext_mark_unwritten(ex); @@ -3876,7 +3871,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (err) - return err; + goto errout; ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -3885,7 +3880,11 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (*allocated > map->m_len) *allocated = map->m_len; map->m_len = *allocated; - return 0; + return path; + +errout: + ext4_free_ext_path(path); + return ERR_PTR(err); } static struct ext4_ext_path * @@ -4256,8 +4255,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { - err = convert_initialized_extent(handle, - inode, map, &path, &allocated); + path = convert_initialized_extent(handle, + inode, map, path, &allocated); + if (IS_ERR(path)) + err = PTR_ERR(path); goto out; } else if (!ext4_ext_is_unwritten(ex)) { map->m_flags |= EXT4_MAP_MAPPED; From a2c613b8c4860d5e70010e7391fff727c5d96bab Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:43 +0800 Subject: [PATCH 82/93] ext4: refactor ext4_swap_extents() to reuse extents path The ext4_find_extent() can update the extent path so it doesn't have to allocate and free path repeatedly, thus reducing the consumption of memory allocation and freeing in ext4_swap_extents(). Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-24-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 48 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index dbadf2aa4376..e671ad642cfb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5663,25 +5663,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, int e1_len, e2_len, len; int split = 0; - path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + path1 = ext4_find_extent(inode1, lblk1, path1, EXT4_EX_NOCACHE); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); - path1 = NULL; - finish: - count = 0; - goto repeat; + goto errout; } - path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + path2 = ext4_find_extent(inode2, lblk2, path2, EXT4_EX_NOCACHE); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); - path2 = NULL; - goto finish; + goto errout; } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have something to swap ? */ if (unlikely(!ex2 || !ex1)) - goto finish; + goto errout; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); @@ -5703,7 +5699,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, next2 = e2_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) - goto finish; + goto errout; /* Move to the rightest boundary */ len = next1 - lblk1; if (len < next2 - lblk2) @@ -5713,7 +5709,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, lblk1 += len; lblk2 += len; count -= len; - goto repeat; + continue; } /* Prepare left boundary */ @@ -5723,7 +5719,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, path1, lblk1, 0); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); - goto finish; + goto errout; } } if (e2_blk < lblk2) { @@ -5732,13 +5728,13 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, path2, lblk2, 0); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); - goto finish; + goto errout; } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) - goto repeat; + continue; /* Prepare right boundary */ len = count; @@ -5753,7 +5749,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, path1, lblk1 + len, 0); if (IS_ERR(path1)) { *erp = PTR_ERR(path1); - goto finish; + goto errout; } } if (len != e2_len) { @@ -5762,21 +5758,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, path2, lblk2 + len, 0); if (IS_ERR(path2)) { *erp = PTR_ERR(path2); - goto finish; + goto errout; } } /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) - goto repeat; + continue; BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; /* Both extents are fully inside boundaries. Swap it now */ tmp_ex = *ex1; @@ -5794,7 +5790,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, *erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth); if (unlikely(*erp)) - goto finish; + goto errout; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* @@ -5804,17 +5800,17 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, * aborted anyway. */ if (unlikely(*erp)) - goto finish; + goto errout; + lblk1 += len; lblk2 += len; replaced_count += len; count -= len; - - repeat: - ext4_free_ext_path(path1); - ext4_free_ext_path(path2); - path1 = path2 = NULL; } + +errout: + ext4_free_ext_path(path1); + ext4_free_ext_path(path2); return replaced_count; } From 2352e3e461926b59f01c1e39fbb0494891cff997 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:44 +0800 Subject: [PATCH 83/93] ext4: make some fast commit functions reuse extents path The ext4_find_extent() can update the extent path so that it does not have to allocate and free the path repeatedly, thus reducing the consumption of memory allocation and freeing in the following functions: ext4_ext_clear_bb ext4_ext_replay_set_iblocks ext4_fc_replay_add_range ext4_fc_set_bitmaps_and_counters No functional changes. Note that ext4_find_extent() does not support error pointers, so in this case set path to NULL first. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Tested-by: Ojaswin Mujoo Link: https://patch.msgid.link/20240822023545.1994557-25-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 51 +++++++++++++++++++------------------------ fs/ext4/fast_commit.c | 11 ++++++---- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e671ad642cfb..29aaa7340b09 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -6033,12 +6033,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); + if (!ex) goto out; - } end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_free_ext_path(path); /* Count the number of data blocks */ cur = 0; @@ -6064,32 +6061,28 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) ret = skip_hole(inode, &cur); if (ret < 0) goto out; - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; numblks += path->p_depth; - ext4_free_ext_path(path); while (cur < end) { - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) break; ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); - return 0; - } + if (!ex) + goto cleanup; + cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); ret = skip_hole(inode, &cur); - if (ret < 0) { - ext4_free_ext_path(path); + if (ret < 0) break; - } - path2 = ext4_find_extent(inode, cur, NULL, 0); - if (IS_ERR(path2)) { - ext4_free_ext_path(path); + + path2 = ext4_find_extent(inode, cur, path2, 0); + if (IS_ERR(path2)) break; - } + for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) { cmp1 = cmp2 = 0; if (i <= path->p_depth) @@ -6101,13 +6094,14 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) if (cmp1 != cmp2 && cmp2 != 0) numblks++; } - ext4_free_ext_path(path); - ext4_free_ext_path(path2); } out: inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9); ext4_mark_inode_dirty(NULL, inode); +cleanup: + ext4_free_ext_path(path); + ext4_free_ext_path(path2); return 0; } @@ -6128,12 +6122,9 @@ int ext4_ext_clear_bb(struct inode *inode) if (IS_ERR(path)) return PTR_ERR(path); ex = path[path->p_depth].p_ext; - if (!ex) { - ext4_free_ext_path(path); - return 0; - } + if (!ex) + goto out; end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); - ext4_free_ext_path(path); cur = 0; while (cur < end) { @@ -6143,16 +6134,16 @@ int ext4_ext_clear_bb(struct inode *inode) if (ret < 0) break; if (ret > 0) { - path = ext4_find_extent(inode, map.m_lblk, NULL, 0); - if (!IS_ERR_OR_NULL(path)) { + path = ext4_find_extent(inode, map.m_lblk, path, 0); + if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) { - ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } - ext4_free_ext_path(path); + } else { + path = NULL; } ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, @@ -6161,5 +6152,7 @@ int ext4_ext_clear_bb(struct inode *inode) cur = cur + map.m_len; } +out: + ext4_free_ext_path(path); return 0; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index afcb2e5cdac7..eaa5f5b51f50 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1792,7 +1792,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ret == 0) { /* Range is not mapped */ - path = ext4_find_extent(inode, cur, NULL, 0); + path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); @@ -1808,7 +1808,6 @@ static int ext4_fc_replay_add_range(struct super_block *sb, up_write((&EXT4_I(inode)->i_data_sem)); if (IS_ERR(path)) goto out; - ext4_free_ext_path(path); goto next; } @@ -1856,6 +1855,7 @@ next: ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: + ext4_free_ext_path(path); iput(inode); return 0; } @@ -1956,12 +1956,13 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) break; if (ret > 0) { - path = ext4_find_extent(inode, map.m_lblk, NULL, 0); + path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, true); - ext4_free_ext_path(path); + } else { + path = NULL; } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, @@ -1972,6 +1973,8 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) } iput(inode); } + + ext4_free_ext_path(path); } /* From 5f48d4d9d8556ffa934537f0c000f9c3e108da66 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 22 Aug 2024 10:35:45 +0800 Subject: [PATCH 84/93] ext4: save unnecessary indentation in ext4_ext_create_new_leaf() Save an indentation level in ext4_ext_create_new_leaf() by removing unnecessary 'else'. Besides, the variable 'ee_block' is declared to avoid line breaks. No functional changes. Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240822023545.1994557-26-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 48 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 29aaa7340b09..34e25eee6521 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1403,6 +1403,7 @@ ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, { struct ext4_ext_path *curp; int depth, i, err = 0; + ext4_lblk_t ee_block = le32_to_cpu(newext->ee_block); repeat: i = depth = ext_depth(inode); @@ -1424,33 +1425,30 @@ repeat: goto errout; /* refill path */ - path = ext4_find_extent(inode, - (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path, gb_flags); + path = ext4_find_extent(inode, ee_block, path, gb_flags); return path; - } else { - /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, mb_flags); - if (err) - goto errout; - - /* refill path */ - path = ext4_find_extent(inode, - (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path, gb_flags); - if (IS_ERR(path)) - return path; - - /* - * only first (depth 0 -> 1) produces free space; - * in all other cases we have to split the grown tree - */ - depth = ext_depth(inode); - if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { - /* now we need to split */ - goto repeat; - } } + + /* tree is full, time to grow in depth */ + err = ext4_ext_grow_indepth(handle, inode, mb_flags); + if (err) + goto errout; + + /* refill path */ + path = ext4_find_extent(inode, ee_block, path, gb_flags); + if (IS_ERR(path)) + return path; + + /* + * only first (depth 0 -> 1) produces free space; + * in all other cases we have to split the grown tree + */ + depth = ext_depth(inode); + if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { + /* now we need to split */ + goto repeat; + } + return path; errout: From a2187431c395cdfbf144e3536f25468c64fc7cfa Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 27 Aug 2024 16:16:36 -0400 Subject: [PATCH 85/93] ext4: fix error message when rejecting the default hash Commit 985b67cd8639 ("ext4: filesystems without casefold feature cannot be mounted with siphash") properly rejects volumes where s_def_hash_version is set to DX_HASH_SIPHASH, but the check and the error message should not look into casefold setup - a filesystem should never have DX_HASH_SIPHASH as the default hash. Fix it and, since we are there, move the check to ext4_hash_info_init. Fixes:985b67cd8639 ("ext4: filesystems without casefold feature cannot be mounted with siphash") Signed-off-by: Gabriel Krisman Bertazi Link: https://patch.msgid.link/87jzg1en6j.fsf_-_@mailhost.krisman.be Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 481ece3660eb..7ac668d4ce83 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2462,6 +2462,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_SIPHASH 6 +#define DX_HASH_LAST DX_HASH_SIPHASH static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, const void *address, unsigned int length) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 58423e6bf3d0..adc5046fe9dd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3583,13 +3583,6 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly) "mounted without CONFIG_UNICODE"); return 0; } - if (EXT4_SB(sb)->s_es->s_def_hash_version == DX_HASH_SIPHASH && - !ext4_has_feature_casefold(sb)) { - ext4_msg(sb, KERN_ERR, - "Filesystem without casefold feature cannot be " - "mounted with siphash"); - return 0; - } if (readonly) return 1; @@ -5095,16 +5088,27 @@ out: return ret; } -static void ext4_hash_info_init(struct super_block *sb) +static int ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; + sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_def_hash_version > DX_HASH_LAST) { + ext4_msg(sb, KERN_ERR, + "Invalid default hash set in the superblock"); + return -EINVAL; + } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) { + ext4_msg(sb, KERN_ERR, + "SIPHASH is not a valid default hash value"); + return -EINVAL; + } + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) @@ -5122,6 +5126,7 @@ static void ext4_hash_info_init(struct super_block *sb) #endif } } + return 0; } static int ext4_block_group_meta_init(struct super_block *sb, int silent) @@ -5257,7 +5262,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount; - ext4_hash_info_init(sb); + err = ext4_hash_info_init(sb); + if (err) + goto failed_mount; err = ext4_handle_clustersize(sb); if (err) From 59efe53e380ee305ec11378233adb6aaebe1856c Mon Sep 17 00:00:00 2001 From: yangerkun Date: Thu, 29 Aug 2024 19:02:22 +0800 Subject: [PATCH 86/93] ext4: dax: keep orphan list before truncate overflow allocated blocks Any extending write for ext4 requires the inode to be placed on the orphan list before the actual write. In addition, the inode can be actually removed from the orphan list only after all writes are completed. Otherwise we'd leave allocated blocks beyond i_disksize if we could not copy all the data into allocated block and e2fsck would complain. Currently, direct IO and buffered IO comply with this logic(buffered IO will truncate all overflow allocated blocks that has not been written successfully, and direct IO will truncate all allocated blocks when error occurs). However, dax write break this since dax write will remove the inode from the orphan list by calling ext4_handle_inode_extension unconditionally during extending write. We add a argument to help determine does we do a fully write, and for the case not fully write, we leave the inode on the orphan list, and the latter ext4_inode_extension_cleanup will help us truncate the overflow allocated blocks, and then remove the inode from the orphan list. Signed-off-by: yangerkun Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240829110222.126685-1-yangerkun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index be061bb64067..f14aed14b9cf 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -306,7 +306,7 @@ out: } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, - ssize_t count) + ssize_t written, ssize_t count) { handle_t *handle; @@ -315,7 +315,7 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, if (IS_ERR(handle)) return PTR_ERR(handle); - if (ext4_update_inode_size(inode, offset + count)) { + if (ext4_update_inode_size(inode, offset + written)) { int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { ext4_journal_stop(handle); @@ -323,11 +323,11 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, } } - if (inode->i_nlink) + if ((written == count) && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); - return count; + return written; } /* @@ -393,7 +393,7 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) return size; - return ext4_handle_inode_extension(inode, pos, size); + return ext4_handle_inode_extension(inode, pos, size, size); } static const struct iomap_dio_ops ext4_dio_write_ops = { @@ -669,7 +669,7 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) { - ret = ext4_handle_inode_extension(inode, offset, ret); + ret = ext4_handle_inode_extension(inode, offset, ret, count); ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); } out: From 3910b513fcdf33030798755c722174ef4a827d2a Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Fri, 30 Aug 2024 13:37:36 +0800 Subject: [PATCH 87/93] ext4: persist the new uptodate buffers in ext4_journalled_zero_new_buffers For new uptodate buffers we also need to call write_end_fn() to persist the uptodate content, similarly as folio_zero_new_buffers() does it. Suggested-by: Jan Kara Signed-off-by: Shida Zhang Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240830053739.3588573-2-zhangshida@kylinos.cn Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a6f1286e824d..5ffd56ceeddd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1371,9 +1371,9 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, size = min(to, block_end) - start; folio_zero_range(folio, start, size); - write_end_fn(handle, inode, bh); } clear_buffer_new(bh); + write_end_fn(handle, inode, bh); } } block_start = block_end; From 6b730a405037501a260d6efd24782d2737e65d07 Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Fri, 30 Aug 2024 13:37:37 +0800 Subject: [PATCH 88/93] ext4: hoist ext4_block_write_begin and replace the __block_write_begin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using __block_write_begin() make it inconvenient to journal the user data dirty process. We can't tell the block layer maintainer, ‘Hey, we want to trace the dirty user data in ext4, can we add some special code for ext4 in __block_write_begin?’:P So use ext4_block_write_begin() instead. The two functions are basically doing the same thing except for the fscrypt related code. Remove the unnecessary #ifdef since fscrypt_inode_uses_fs_layer_crypto() returns false (and it's known at compile time) when !CONFIG_FS_ENCRYPTION. And hoist the ext4_block_write_begin so that it can be used in other files. Suggested-by: Jan Kara Suggested-by: Eric Biggers Signed-off-by: Shida Zhang Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240830053739.3588573-3-zhangshida@kylinos.cn Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ fs/ext4/inline.c | 10 +++++----- fs/ext4/inode.c | 24 +++++------------------- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7ac668d4ce83..8bd08637fe14 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3853,6 +3853,8 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } +extern int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, + get_block_t *get_block); #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 44a5f6df59ec..b018a80fec7a 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,10 +601,10 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(&folio->page, from, to, - ext4_get_block_unwritten); + ret = ext4_block_write_begin(folio, from, to, + ext4_get_block_unwritten); } else - ret = __block_write_begin(&folio->page, from, to, ext4_get_block); + ret = ext4_block_write_begin(folio, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -856,8 +856,8 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = __block_write_begin(&folio->page, 0, inline_size, - ext4_da_get_block_prep); + ret = ext4_block_write_begin(folio, 0, inline_size, + ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); folio_unlock(folio); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5ffd56ceeddd..0010f7004688 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1006,10 +1006,10 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* - * __block_write_begin() could have dirtied some buffers. Clean + * ext4_block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit - * by __block_write_begin() isn't a real problem here as we clear + * by ext4_block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ @@ -1023,9 +1023,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, return ret; } -#ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, - get_block_t *get_block) +int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, + get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -1116,7 +1115,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, return err; } -#endif /* * To preserve ordering, it is essential that the hole instantiation and @@ -1198,19 +1196,11 @@ retry_journal: /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); -#ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); -#else - if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(&folio->page, pos, len, - ext4_get_block_unwritten); - else - ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); -#endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, @@ -1223,7 +1213,7 @@ retry_journal: folio_unlock(folio); /* - * __block_write_begin may have instantiated a few blocks + * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * @@ -2936,11 +2926,7 @@ retry: if (IS_ERR(folio)) return PTR_ERR(folio); -#ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); -#else - ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); -#endif if (ret < 0) { folio_unlock(folio); folio_put(folio); From cb3de5fc876ee9ef2b830c9e6cdafac5c90903ef Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Fri, 30 Aug 2024 13:37:38 +0800 Subject: [PATCH 89/93] ext4: fix a potential assertion failure due to improperly dirtied buffer On an old kernel version(4.19, ext3, data=journal, pagesize=64k), an assertion failure will occasionally be triggered by the line below: ----------- jbd2_journal_commit_transaction { ... J_ASSERT_BH(bh, !buffer_dirty(bh)); /* * The buffer on BJ_Forget list and not jbddirty means ... } ----------- The same condition may also be applied to the lattest kernel version. When blocksize < pagesize and we truncate a file, there can be buffers in the mapping tail page beyond i_size. These buffers will be filed to transaction's BJ_Forget list by ext4_journalled_invalidatepage() during truncation. When the transaction doing truncate starts committing, we can grow the file again. This calls __block_write_begin() which allocates new blocks under these buffers in the tail page we go through the branch: if (buffer_new(bh)) { clean_bdev_bh_alias(bh); if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } ... } Hence buffers on BJ_Forget list of the committing transaction get marked dirty and this triggers the jbd2 assertion. Teach ext4_block_write_begin() to properly handle files with data journalling by avoiding dirtying them directly. Instead of folio_zero_new_buffers() we use ext4_journalled_zero_new_buffers() which takes care of handling journalling. We also don't need to mark new uptodate buffers as dirty in ext4_block_write_begin(). That will be either done either by block_commit_write() in case of success or by folio_zero_new_buffers() in case of failure. Reported-by: Baolin Liu Suggested-by: Jan Kara Signed-off-by: Shida Zhang Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240830053739.3588573-4-zhangshida@kylinos.cn Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/inline.c | 7 ++++--- fs/ext4/inode.c | 42 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8bd08637fe14..8cc15d00e5c8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3853,7 +3853,8 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) return buffer_uptodate(bh); } -extern int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, +extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, get_block_t *get_block); #endif /* __KERNEL__ */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index b018a80fec7a..7ca4aca53162 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,10 +601,11 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = ext4_block_write_begin(folio, from, to, + ret = ext4_block_write_begin(handle, folio, from, to, ext4_get_block_unwritten); } else - ret = ext4_block_write_begin(folio, from, to, ext4_get_block); + ret = ext4_block_write_begin(handle, folio, from, to, + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -856,7 +857,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = ext4_block_write_begin(folio, 0, inline_size, + ret = ext4_block_write_begin(NULL, folio, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0010f7004688..a11e2626b5f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -49,6 +49,11 @@ #include +static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, + struct folio *folio, + unsigned from, unsigned to); + static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { @@ -1023,7 +1028,8 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, return ret; } -int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, +int ext4_block_write_begin(handle_t *handle, struct folio *folio, + loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); @@ -1037,6 +1043,7 @@ int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; + bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); @@ -1066,10 +1073,22 @@ int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { + /* + * We may be zeroing partial buffers or all new + * buffers in case of failure. Prepare JBD2 for + * that. + */ + if (should_journal_data) + do_journal_get_write_access(handle, + inode, bh); if (folio_test_uptodate(folio)) { - clear_buffer_new(bh); + /* + * Unlike __block_write_begin() we leave + * dirtying of new uptodate buffers to + * ->write_end() time or + * folio_zero_new_buffers(). + */ set_buffer_uptodate(bh); - mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) @@ -1099,7 +1118,11 @@ int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - folio_zero_new_buffers(folio, from, to); + if (should_journal_data) + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); + else + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1197,10 +1220,11 @@ retry_journal: folio_wait_stable(folio); if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(folio, pos, len, + ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); + ret = ext4_block_write_begin(handle, folio, pos, len, + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, @@ -2926,7 +2950,8 @@ retry: if (IS_ERR(folio)) return PTR_ERR(folio); - ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); + ret = ext4_block_write_begin(NULL, folio, pos, len, + ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); @@ -6183,7 +6208,8 @@ retry_alloc: if (folio_pos(folio) + len > size) len = size - folio_pos(folio); - err = __block_write_begin(&folio->page, 0, len, ext4_get_block); + err = ext4_block_write_begin(handle, folio, 0, len, + ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) From 183aa1d3baea18b199505455a0247b13de826e2f Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Fri, 30 Aug 2024 13:37:39 +0800 Subject: [PATCH 90/93] ext4: remove the special buffer dirty handling in do_journal_get_write_access This kinda revert the commit 56d35a4cd13e("ext4: Fix dirtying of journalled buffers in data=journal mode") made by Jan 14 years ago, since the do_get_write_access() itself can deal with the extra unexpected buf dirting things in a proper way now. Suggested-by: Jan Kara Signed-off-by: Shida Zhang Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240830053739.3588573-5-zhangshida@kylinos.cn Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a11e2626b5f6..7475deef9793 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1005,27 +1005,11 @@ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { - int dirty = buffer_dirty(bh); - int ret; - if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - /* - * ext4_block_write_begin() could have dirtied some buffers. Clean - * the dirty bit as jbd2_journal_get_write_access() could complain - * otherwise about fs integrity issues. Setting of the dirty bit - * by ext4_block_write_begin() isn't a real problem here as we clear - * the bit before releasing a page lock and thus writeback cannot - * ever write the buffer. - */ - if (dirty) - clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); - ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); - if (!ret && dirty) - ret = ext4_dirty_journalled_data(handle, bh); - return ret; } int ext4_block_write_begin(handle_t *handle, struct folio *folio, From cc749e61c011c255d81b192a822db650c68b313f Mon Sep 17 00:00:00 2001 From: Artem Sadovnikov Date: Thu, 29 Aug 2024 15:22:09 +0000 Subject: [PATCH 91/93] ext4: fix i_data_sem unlock order in ext4_ind_migrate() Fuzzing reports a possible deadlock in jbd2_log_wait_commit. This issue is triggered when an EXT4_IOC_MIGRATE ioctl is set to require synchronous updates because the file descriptor is opened with O_SYNC. This can lead to the jbd2_journal_stop() function calling jbd2_might_wait_for_commit(), potentially causing a deadlock if the EXT4_IOC_MIGRATE call races with a write(2) system call. This problem only arises when CONFIG_PROVE_LOCKING is enabled. In this case, the jbd2_might_wait_for_commit macro locks jbd2_handle in the jbd2_journal_stop function while i_data_sem is locked. This triggers lockdep because the jbd2_journal_start function might also lock the same jbd2_handle simultaneously. Found by Linux Verification Center (linuxtesting.org) with syzkaller. Reviewed-by: Ritesh Harjani (IBM) Co-developed-by: Mikhail Ukhin Signed-off-by: Mikhail Ukhin Signed-off-by: Artem Sadovnikov Rule: add Link: https://lore.kernel.org/stable/20240404095000.5872-1-mish.uxin2012%40yandex.ru Link: https://patch.msgid.link/20240829152210.2754-1-ancowi69@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 0f68b8a14560..1b0dfd963d3f 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -664,8 +664,8 @@ int ext4_ind_migrate(struct inode *inode) if (unlikely(ret2 && !ret)) ret = ret2; errout: - ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; From ee85e0938aa8f9846d21e4d302c3cf6a2a75110d Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 30 Aug 2024 12:50:57 +0530 Subject: [PATCH 92/93] ext4: check stripe size compatibility on remount as well We disable stripe size in __ext4_fill_super if it is not a multiple of the cluster ratio however this check is missed when trying to remount. This can leave us with cases where stripe < cluster_ratio after remount:set making EXT4_B2C(sbi->s_stripe) become 0 that can cause some unforeseen bugs like divide by 0. Fix that by adding the check in remount path as well. Reported-by: syzbot+1ad8bac5af24d01e2cbd@syzkaller.appspotmail.com Tested-by: syzbot+1ad8bac5af24d01e2cbd@syzkaller.appspotmail.com Reviewed-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Fixes: c3defd99d58c ("ext4: treat stripe in block unit") Signed-off-by: Ojaswin Mujoo Link: https://patch.msgid.link/3a493bb503c3598e25dcfbed2936bb2dff3fece7.1725002410.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index adc5046fe9dd..16a4ce704460 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5178,6 +5178,18 @@ static int ext4_block_group_meta_init(struct super_block *sb, int silent) return 0; } +/* + * It's hard to get stripe aligned blocks if stripe is not aligned with + * cluster, just disable stripe and alert user to simplify code and avoid + * stripe aligned allocation which will rarely succeed. + */ +static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + return (stripe > 0 && sbi->s_cluster_ratio > 1 && + stripe % sbi->s_cluster_ratio != 0); +} + static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; @@ -5287,13 +5299,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); - /* - * It's hard to get stripe aligned blocks if stripe is not aligned with - * cluster, just disable stripe and alert user to simpfy code and avoid - * stripe aligned allocation which will rarely successes. - */ - if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 && - sbi->s_stripe % sbi->s_cluster_ratio != 0) { + if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", @@ -6457,6 +6463,15 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } + if ((ctx->spec & EXT4_SPEC_s_stripe) && + ext4_is_stripe_incompatible(sb, ctx->s_stripe)) { + ext4_msg(sb, KERN_WARNING, + "stripe (%lu) is not aligned with cluster size (%u), " + "stripe is disabled", + ctx->s_stripe, sbi->s_cluster_ratio); + ctx->s_stripe = 0; + } + /* * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause * two calls to ext4_should_dioread_nolock() to return inconsistent From ff2beee206d23f49d022650122f81285849033e4 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 30 Aug 2024 12:50:58 +0530 Subject: [PATCH 93/93] ext4: convert EXT4_B2C(sbi->s_stripe) users to EXT4_NUM_B2C Although we have checks to make sure s_stripe is a multiple of cluster size, in case we accidentally end up with a scenario where this is not the case, use EXT4_NUM_B2C() so that we don't end up with unexpected cases where EXT4_B2C(stripe) becomes 0. Also make the is_stripe_aligned check in regular_allocator a bit more robust while we are at it. This should ideally have no functional change unless we have a bug somewhere causing (stripe % cluster_size != 0) Reviewed-by: Kemeng Shi Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Link: https://patch.msgid.link/e0c0a3b58a40935a1361f668851d041575861411.1725002410.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bf9ba7effbd5..d73e38323879 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2356,7 +2356,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && - ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) { + ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); @@ -2553,7 +2553,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - stripe = EXT4_B2C(sbi, sbi->s_stripe); + stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { @@ -2928,9 +2928,11 @@ repeat: if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); else { - bool is_stripe_aligned = sbi->s_stripe && + bool is_stripe_aligned = + (sbi->s_stripe >= + sbi->s_cluster_ratio) && !(ac->ac_g_ex.fe_len % - EXT4_B2C(sbi, sbi->s_stripe)); + EXT4_NUM_B2C(sbi, sbi->s_stripe)); if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && @@ -3706,7 +3708,7 @@ int ext4_mb_init(struct super_block *sb) */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( - sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); + sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);