diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index de5e4f2adfea..0ef5cc13fae2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,7 +66,11 @@ struct btrfs_inode { */ struct btrfs_key location; - /* Lock for counters */ + /* + * Lock for counters and all fields used to determine if the inode is in + * the log or not (last_trans, last_sub_trans, last_log_commit, + * logged_trans). + */ spinlock_t lock; /* the extent_tree has caches of all the extent mappings to disk */ @@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode) static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { + int ret = 0; + + spin_lock(&BTRFS_I(inode)->lock); if (BTRFS_I(inode)->logged_trans == generation && BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit && @@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) */ smp_mb(); if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) - return 1; + ret = 1; } - return 0; + spin_unlock(&BTRFS_I(inode)->lock); + return ret; } #define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 150db5e50c2d..fd105c172c8b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1811,7 +1811,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; + spin_unlock(&BTRFS_I(inode)->lock); if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 937050a2b68e..96b189b8898a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -136,9 +136,11 @@ struct btrfs_pending_snapshot { static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + spin_unlock(&BTRFS_I(inode)->lock); } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6c95159302dd..016c90fc85db 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -492,11 +492,19 @@ insert: if (btrfs_inode_generation(eb, src_item) == 0) { struct extent_buffer *dst_eb = path->nodes[0]; + const u64 ino_size = btrfs_inode_size(eb, src_item); + /* + * For regular files an ino_size == 0 is used only when + * logging that an inode exists, as part of a directory + * fsync, and the inode wasn't fsynced before. In this + * case don't set the size of the inode in the fs/subvol + * tree, otherwise we would be throwing valid data away. + */ if (S_ISREG(btrfs_inode_mode(eb, src_item)) && - S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && + ino_size != 0) { struct btrfs_map_token token; - u64 ino_size = btrfs_inode_size(eb, src_item); btrfs_init_map_token(&token); btrfs_set_token_inode_size(dst_eb, dst_item, @@ -3124,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, + struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; @@ -3208,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, src = path->nodes[0]; nritems = btrfs_header_nritems(src); for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + btrfs_item_key_to_cpu(src, &min_key, i); if (min_key.objectid != ino || min_key.type != key_type) @@ -3218,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, err = ret; goto done; } + + /* + * We must make sure that when we log a directory entry, + * the corresponding inode, after log replay, has a + * matching link count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * + * + * + * Would result in a fsync log that when replayed, our + * file inode would have a link count of 1, but we get + * two directory entries pointing to the same inode. + * After removing one of the names, it would not be + * possible to remove the other name, which resulted + * always in stale file handle errors, and would not + * be possible to rmdir the parent directory, since + * its i_size could never decrement to the value + * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. + */ + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &tmp); + if (ctx && + (btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + tmp.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; } path->slots[0] = nritems; @@ -3279,7 +3321,8 @@ done: static noinline int log_directory_changes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { u64 min_key; u64 max_key; @@ -3291,7 +3334,7 @@ again: max_key = 0; while (1) { ret = log_dir_items(trans, root, inode, path, - dst_path, key_type, min_key, + dst_path, key_type, ctx, min_key, &max_key); if (ret) return ret; @@ -4067,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, if (ret < 0) { return ret; } else if (ret > 0) { - *size_ret = i_size_read(inode); + *size_ret = 0; } else { struct btrfs_inode_item *item; @@ -4374,15 +4417,18 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path); + ret = log_directory_changes(trans, root, inode, path, dst_path, + ctx); if (ret) { err = ret; goto out_unlock; } } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; + spin_unlock(&BTRFS_I(inode)->lock); out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); @@ -4469,6 +4515,181 @@ out: return ret; } +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. See log_dir_items() for + * details about the why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not lock their i_mutex, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not locking i_mutex of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and + * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names. This does not result in a problem because if a dir_item key is + * logged but its matching dir_index key is not logged, at log replay time we + * don't use it to replay the respective name (see replay_one_name()). On the + * other hand if only the dir_index key ends up being logged, the respective + * name is added to the fs/subvol tree with both the dir_item and dir_index + * keys created (see replay_one_name()). + * The directory's inode item with a wrong i_size is not a problem as well, + * since we don't use it at log replay time to set the i_size in the inode + * item of the fs/subvol tree (see overwrite_item()). + */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + btrfs_free_path(path); + return -ENOMEM; + } + dir_elem->ino = btrfs_ino(start_inode); + list_add_tail(&dir_elem->list, &dir_list); + + while (!list_empty(&dir_list)) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + int nritems; + int i; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, + list); + if (ret) + goto next_dir_inode; + + min_key.objectid = dir_elem->ino; + min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(log, &min_key, path, trans->transid); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + +process_leaf: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + struct btrfs_dir_list *new_dir_elem; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != dir_elem->ino || + min_key.type != BTRFS_DIR_ITEM_KEY) + goto next_dir_inode; + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_type(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid && + type != BTRFS_FT_DIR) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(root->fs_info->sb, &di_key, + root, NULL); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto next_dir_inode; + } + + if (btrfs_inode_in_log(di_inode, trans->transid)) { + iput(di_inode); + continue; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + btrfs_release_path(path); + ret = btrfs_log_inode(trans, root, di_inode, + log_mode, 0, LLONG_MAX, ctx); + iput(di_inode); + if (ret) + goto next_dir_inode; + if (ctx->log_new_dentries) { + new_dir_elem = kmalloc(sizeof(*new_dir_elem), + GFP_NOFS); + if (!new_dir_elem) { + ret = -ENOMEM; + goto next_dir_inode; + } + new_dir_elem->ino = di_key.objectid; + list_add_tail(&new_dir_elem->list, &dir_list); + } + break; + } + if (i == nritems) { + ret = btrfs_next_leaf(log, path); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + goto process_leaf; + } + if (min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } +next_dir_inode: + list_del(&dir_elem->list); + kfree(dir_elem); + } + + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -4491,6 +4712,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, const struct dentry * const first_parent = parent; const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > last_committed); + bool log_dentries = false; + struct inode *orig_inode = inode; sb = inode->i_sb; @@ -4546,6 +4769,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) + log_dentries = true; + while (1) { if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; @@ -4582,7 +4808,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, dput(old_parent); old_parent = parent; } - ret = 0; + if (log_dentries) + ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + else + ret = 0; end_trans: dput(old_parent); if (ret < 0) { diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 154990c26dcb..6916a781ea02 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -29,6 +29,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; int io_err; + bool log_new_dentries; struct list_head list; }; @@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; + ctx->log_new_dentries = false; INIT_LIST_HEAD(&ctx->list); }