diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50656edad939..9e21fbcbac6d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4133,9 +4133,18 @@ skip_backref: goto err; } - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); + /* + * If we are in a rename context, we don't need to update anything in the + * log. That will be done later during the rename by btrfs_log_new_name(). + * Besides that, doing it here would only cause extra unnecessary btree + * operations on the log tree, increasing latency for applications. + */ + if (!rename_ctx) { + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, + index); + } /* * If we have a pending delayed iput we could end up with the final iput @@ -9041,8 +9050,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, u64 new_idx = 0; int ret; int ret2; - bool root_log_pinned = false; - bool dest_log_pinned = false; bool need_abort = false; /* @@ -9145,29 +9152,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } - /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. - * - * We pin the logs even if at this precise moment none of the inodes was - * logged before. This is because right after we checked for that, some - * other task fsyncing some other inode not involved with this rename - * operation could log that one of our inodes exists. - * - * We don't need to pin the logs before the above calls to - * btrfs_insert_inode_ref(), since those don't ever need to change a log. 
- */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(root); - root_log_pinned = true; - } - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; - } - /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9223,46 +9207,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; - if (root_log_pinned) { + /* + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. + */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_pin_log_trans(dest); + + /* Do the log updates for all inodes. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } -out_fail: - /* - * If we have pinned a log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. 
If the log does not contain any of these inodes, we - * allow the tasks to sync it. - */ - if (ret && (root_log_pinned || dest_log_pinned)) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) - btrfs_set_log_full_commit(trans); - if (root_log_pinned) { - btrfs_end_log_trans(root); - root_log_pinned = false; - } - if (dest_log_pinned) { - btrfs_end_log_trans(dest); - dest_log_pinned = false; - } - } + /* Now unpin the logs. */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(root); + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) + btrfs_end_log_trans(dest); +out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -9342,7 +9311,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); - bool log_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9447,25 +9415,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { - /* - * Now pin the log. We do it to ensure that no other task can - * sync the log while we are in progress with the rename, as - * that could result in an inconsistency in case any of the - * inodes that are part of this rename operation were logged - * before. - * - * We pin the log even if at this precise moment none of the - * inodes was logged before. This is because right after we - * checked for that, some other task fsyncing some other inode - * not involved with this rename operation could log that one of - * our inodes exists. 
- * - * We don't need to pin the logs before the above call to - * btrfs_insert_inode_ref(), since that does not need to change - * a log. - */ - btrfs_pin_log_trans(root); - log_pinned = true; ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, @@ -9512,12 +9461,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (log_pinned) { + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); - btrfs_end_log_trans(root); - log_pinned = false; - } if (flags & RENAME_WHITEOUT) { ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, @@ -9529,28 +9475,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } } out_fail: - /* - * If we have pinned the log and an error happened, we unpin tasks - * trying to sync the log and force them to fallback to a transaction - * commit if the log currently contains any of the inodes involved in - * this rename operation (to ensure we do not persist a log with an - * inconsistent state for any of these inodes or leading to any - * inconsistencies when replayed). If the transaction was aborted, the - * abortion reason is propagated to userspace when attempting to commit - * the transaction. If the log does not contain any of these inodes, we - * allow the tasks to sync it. - */ - if (ret && log_pinned) { - if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || - btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - (new_inode && - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) - btrfs_set_log_full_commit(trans); - - btrfs_end_log_trans(root); - log_pinned = false; - } ret2 = btrfs_end_transaction(trans); ret = ret ? 
ret : ret2; out_notrans: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 44719e9c48f3..d697f8a33a5a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6810,7 +6810,10 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, u64 old_dir_index, struct dentry *parent) { struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry)); + struct btrfs_root *root = inode->root; struct btrfs_log_ctx ctx; + bool log_pinned = false; + int ret = 0; /* * this will force the logging code to walk the dentry chain @@ -6837,14 +6840,22 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, if (old_dir && old_dir->logged_trans == trans->transid) { struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; - int ret; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + /* + * We have two inodes to update in the log, the old directory and + * the inode that got renamed, so we must pin the log to prevent + * anyone from syncing the log until we have updated both inodes + * in the log. + */ + log_pinned = true; + btrfs_pin_log_trans(root); + path = btrfs_alloc_path(); if (!path) { - btrfs_set_log_full_commit(trans); - return; + ret = -ENOMEM; + goto out; } /* @@ -6874,10 +6885,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, mutex_unlock(&old_dir->log_mutex); btrfs_free_path(path); - if (ret < 0) { - btrfs_set_log_full_commit(trans); - return; - } + if (ret < 0) + goto out; } btrfs_init_log_ctx(&ctx, &inode->vfs_inode); @@ -6890,5 +6899,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * inconsistent state after a rename operation. */ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); +out: + if (log_pinned) { + /* + * If an error happened mark the log for a full commit because + * it's not consistent and up to date. Do it before unpinning the + * log, to avoid any races with someone else trying to commit it. + */ + if (ret < 0) + btrfs_set_log_full_commit(trans); + btrfs_end_log_trans(root); + } }