From 9d86178782a25fac105e550e1c29c7d3f8470116 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Oct 2024 21:23:41 -0400 Subject: [PATCH] bcachefs: bch2_inode_or_descendents_is_open() fsck can now correctly check if inodes in interior snapshot nodes are open/in use. - Tweak the vfs inode rhashtable so that the subvolume ID isn't hashed, meaning inums in different subvolumes will hash to the same slot. Note that this is a hack, and will cause problems if anyone ever has the same file in many different snapshots open all at the same time. - Then check if any of those subvolumes is a descendent of the snapshot ID being checked Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 106 +++++++++++++++++++++++++++++++++++++------- fs/bcachefs/fs.h | 6 +-- fs/bcachefs/fsck.c | 7 ++- fs/bcachefs/inode.c | 5 ++- 4 files changed, 103 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 23cae92d313d..e9e32d21f82d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) return a.subvol == b.subvol && a.inum == b.inum; } +static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) +{ + const subvol_inum *inum = data; + + return jhash(&inum->inum, sizeof(inum->inum), seed); +} + +static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) +{ + const struct bch_inode_info *inode = data; + + return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); +} + static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -170,32 +184,93 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .head_offset = offsetof(struct bch_inode_info, hash), .key_offset = offsetof(struct bch_inode_info, ei_inum), .key_len = sizeof(subvol_inum), + .hashfn = bch2_vfs_inode_hash_fn, + .obj_hashfn = bch2_vfs_inode_obj_hash_fn, .obj_cmpfn = bch2_vfs_inode_cmp_fn, .automatic_shrinking = true, }; -static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { - return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); -} + struct bch_fs *c = trans->c; + struct rhashtable *ht = &c->vfs_inodes_table; + subvol_inum inum = (subvol_inum) { .inum = p.offset }; + DARRAY(u32) subvols; + int ret = 0; -bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) -{ if (!test_bit(BCH_FS_started, &c->flags)) return false; - subvol_inum inum = { - .subvol = snapshot_t(c, p.snapshot)->subvol, - .inum = p.offset, - }; + darray_init(&subvols); +restart_from_top: - /* snapshot tree interior node, can't safely delete while online (yet) */ - if (!inum.subvol) { - bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot); - return true; + /* + * Tweaked version of __rhashtable_lookup(); we need to get a list of + * subvolumes in which the given inode number is open. + * + * For this to work, we don't include the subvolume ID in the key that + * we hash - all inodes with the same inode number regardless of + * subvolume will hash to the same slot. + * + * This will be less than ideal if the same file is ever open + * simultaneously in many different snapshots: + */ + rcu_read_lock(); + struct rhash_lock_head __rcu *const *bkt; + struct rhash_head *he; + unsigned int hash; + struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); +restart: + hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + bkt = rht_bucket(tbl, hash); + do { + struct bch_inode_info *inode; + + rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { + if (inode->ei_inum.inum == inum.inum) { + ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, + GFP_NOWAIT|__GFP_NOWARN); + if (ret) { + rcu_read_unlock(); + ret = darray_make_room(&subvols, 1); + if (ret) + goto err; + subvols.nr = 0; + goto restart_from_top; + } + } + } + /* An object might have been moved to a different hash chain, + * while we walk along it - better check and retry. + */ + } while (he != RHT_NULLS_MARKER(bkt)); + + /* Ensure we see any new tables. */ + smp_rmb(); + + tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (unlikely(tbl)) + goto restart; + rcu_read_unlock(); + + darray_for_each(subvols, i) { + u32 snap; + ret = bch2_subvolume_get_snapshot(trans, *i, &snap); + if (ret) + goto err; + + ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); + if (ret) + break; } +err: + darray_exit(&subvols); + return ret; +} - return __bch2_inode_hash_find(c, inum) != NULL; +static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +{ + return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); } static void __wait_on_freeing_inode(struct bch_fs *c, @@ -271,7 +346,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, set_bit(EI_INODE_HASHED, &inode->ei_flags); retry: - if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table, + &inode->ei_inum, &inode->hash, bch2_vfs_inodes_params))) { old = bch2_inode_hash_find(c, trans, inode->ei_inum); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 40dbd5774d0b..59f9f7ae728d 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -146,6 +146,8 @@ struct bch_inode_info * __bch2_create(struct mnt_idmap *, struct bch_inode_info *, struct dentry *, umode_t, dev_t, subvol_inum, unsigned); +int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, @@ -179,8 +181,6 @@ void bch2_inode_update_after_write(struct btree_trans *, int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); -bool bch2_inode_is_open(struct bch_fs *c, struct bpos p); - int bch2_setattr_nonsize(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); @@ -198,7 +198,7 @@ int bch2_vfs_init(void); #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) -static inline bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { return false; } +static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; } static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f00a36f62323..a1087fd292e4 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1213,7 +1213,11 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; } else { - if (fsck_err_on(!bch2_inode_is_open(c, k.k->p), + ret = bch2_inode_or_descendents_is_open(trans, k.k->p); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, "inode %llu%u unlinked and not open", u.bi_inum, u.bi_snapshot)) { @@ -1221,6 +1225,7 @@ static int check_inode(struct btree_trans *trans, bch_err_msg(c, ret, "in fsck deleting inode"); goto err_noprint; } + ret = 0; } } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 9d6040d4ba39..2c037e84fbae 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1244,8 +1244,9 @@ next_parent: if (!unlinked) return 0; - if (bch2_inode_is_open(trans->c, pos)) - return 0; + ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos)); + if (ret) + return ret < 0 ? ret : 0; ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot); if (ret)