bcachefs: switch to rhashtable for vfs inodes hash

The standard VFS inode hash table suffers from painful lock contention;
switching to an rhashtable for VFS inodes is long overdue.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2024-06-08 21:41:01 -04:00
parent 88d2ae0e6e
commit 112d21fd1a
12 changed files with 160 additions and 89 deletions

View File

@ -361,7 +361,7 @@ retry:
bch2_trans_begin(trans); bch2_trans_begin(trans);
acl = _acl; acl = _acl;
ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_intent); BTREE_ITER_intent);
if (ret) if (ret)

View File

@ -1023,6 +1023,7 @@ struct bch_fs {
/* fs.c */ /* fs.c */
struct list_head vfs_inodes_list; struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock; struct mutex vfs_inodes_lock;
struct rhashtable vfs_inodes_table;
/* VFS IO PATH - fs-io.c */ /* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset; struct bio_set writepage_bioset;

View File

@ -486,7 +486,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas; op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied); op->write_point = writepoint_hashed(inode->ei_last_dirtied);
op->subvol = inode->ei_subvol; op->subvol = inode->ei_inum.subvol;
op->pos = POS(inode->v.i_ino, sector); op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done; op->end_io = bch2_writepage_io_done;
op->devs_need_flush = &inode->ei_devs_need_flush; op->devs_need_flush = &inode->ei_devs_need_flush;

View File

@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.target = dio->op.opts.foreground_target; dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas; dio->op.nr_replicas = dio->op.opts.data_replicas;
dio->op.subvol = inode->ei_subvol; dio->op.subvol = inode->ei_inum.subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
dio->op.devs_need_flush = &inode->ei_devs_need_flush; dio->op.devs_need_flush = &inode->ei_devs_need_flush;

View File

@ -267,7 +267,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the * XXX: we're doing two index lookups when we end up reading the
* folio * folio
*/ */
ret = range_has_data(c, inode->ei_subvol, ret = range_has_data(c, inode->ei_inum.subvol,
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
if (ret <= 0) if (ret <= 0)
@ -618,7 +618,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin(trans); bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, ret = bch2_subvolume_get_snapshot(trans,
inode->ei_subvol, &snapshot); inode->ei_inum.subvol, &snapshot);
if (ret) if (ret)
goto bkey_err; goto bkey_err;
@ -823,7 +823,7 @@ static int quota_reserve_range(struct bch_inode_info *inode,
retry: retry:
bch2_trans_begin(trans); bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
if (ret) if (ret)
goto err; goto err;

View File

@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
} }
mutex_lock(&inode->ei_update_lock); mutex_lock(&inode->ei_update_lock);
ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_write_inode(c, inode, bch2_inode_flags_set, &s, bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
ATTR_CTIME); ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock); mutex_unlock(&inode->ei_update_lock);
@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
} }
mutex_lock(&inode->ei_update_lock); mutex_lock(&inode->ei_update_lock);
ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_set_projid(c, inode, fa.fsx_projid) ?: bch2_set_projid(c, inode, fa.fsx_projid) ?:
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME); ATTR_CTIME);

View File

@ -108,7 +108,7 @@ retry:
goto retry; goto retry;
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
"%s: inode %u:%llu not found when updating", "%s: inode %llu:%llu not found when updating",
bch2_err_str(ret), bch2_err_str(ret),
inode_inum(inode).subvol, inode_inum(inode).subvol,
inode_inum(inode).inum); inode_inum(inode).inum);
@ -152,50 +152,95 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret; return ret;
} }
static int bch2_iget5_test(struct inode *vinode, void *p) static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{ {
struct bch_inode_info *inode = to_bch_ei(vinode); return a.subvol == b.subvol && a.inum == b.inum;
subvol_inum *inum = p;
return inode->ei_subvol == inum->subvol &&
inode->ei_inode.bi_inum == inum->inum;
} }
static int bch2_iget5_set(struct inode *vinode, void *p) static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{ {
struct bch_inode_info *inode = to_bch_ei(vinode); const struct bch_inode_info *inode = obj;
subvol_inum *inum = p; const subvol_inum *v = arg->key;
inode->v.i_ino = inum->inum; return !subvol_inum_eq(inode->ei_inum, *v);
inode->ei_subvol = inum->subvol;
inode->ei_inode.bi_inum = inum->inum;
return 0;
} }
static unsigned bch2_inode_hash(subvol_inum inum) static const struct rhashtable_params bch2_vfs_inodes_params = {
.head_offset = offsetof(struct bch_inode_info, hash),
.key_offset = offsetof(struct bch_inode_info, ei_inum),
.key_len = sizeof(subvol_inum),
.obj_cmpfn = bch2_vfs_inode_cmp_fn,
.automatic_shrinking = true,
};
static void __wait_on_freeing_inode(struct inode *inode)
{ {
return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
} }
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{ {
return to_bch_ei(ilookup5_nowait(c->vfs_sb, return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
} }
static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{ {
subvol_inum inum = inode_inum(inode); struct bch_inode_info *inode;
struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, repeat:
bch2_inode_hash(inum), inode = __bch2_inode_hash_find(c, inum);
bch2_iget5_test, if (inode) {
bch2_iget5_set, spin_lock(&inode->v.i_lock);
&inum)); if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
BUG_ON(!old); spin_unlock(&inode->v.i_lock);
return NULL;
}
if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
__wait_on_freeing_inode(&inode->v);
goto repeat;
}
__iget(&inode->v);
spin_unlock(&inode->v.i_lock);
}
return inode;
}
static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
spin_lock(&inode->v.i_lock);
bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
spin_unlock(&inode->v.i_lock);
if (remove) {
int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
&inode->hash, bch2_vfs_inodes_params);
BUG_ON(ret);
inode->v.i_hash.pprev = NULL;
}
}
static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
struct bch_inode_info *old = inode;
set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
&inode->hash,
bch2_vfs_inodes_params))) {
old = bch2_inode_hash_find(c, inode->ei_inum);
if (!old)
goto retry;
clear_bit(EI_INODE_HASHED, &inode->ei_flags);
if (unlikely(old != inode)) {
/* /*
* bcachefs doesn't use I_NEW; we have no use for it since we * bcachefs doesn't use I_NEW; we have no use for it since we
* only insert fully created inodes in the inode hash table. But * only insert fully created inodes in the inode hash table. But
@ -211,16 +256,13 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
discard_new_inode(&inode->v); discard_new_inode(&inode->v);
inode = old; inode = old;
} else { } else {
inode_fake_hash(&inode->v);
inode_sb_list_add(&inode->v);
mutex_lock(&c->vfs_inodes_lock); mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock); mutex_unlock(&c->vfs_inodes_lock);
/*
* Again, I_NEW makes no sense for bcachefs. This is only needed
* for clearing I_NEW, but since the inode was already fully
* created and initialized we didn't actually want
* inode_insert5() to set it for us.
*/
unlock_new_inode(&inode->v);
} }
return inode; return inode;
@ -285,11 +327,7 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{ {
struct bch_inode_info *inode = struct bch_inode_info *inode = bch2_inode_hash_find(c, inum);
to_bch_ei(ilookup5_nowait(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
if (inode) if (inode)
return &inode->v; return &inode->v;
@ -303,7 +341,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
if (!ret) { if (!ret) {
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode); inode = bch2_inode_hash_insert(c, inode);
} }
bch2_trans_put(trans); bch2_trans_put(trans);
@ -351,7 +389,7 @@ __bch2_create(struct mnt_idmap *idmap,
retry: retry:
bch2_trans_begin(trans); bch2_trans_begin(trans);
ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?: ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans, bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u, inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE) !(flags & BCH_CREATE_TMPFILE)
@ -365,7 +403,7 @@ retry:
if (unlikely(ret)) if (unlikely(ret))
goto err_before_quota; goto err_before_quota;
inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
inum.inum = inode_u.bi_inum; inum.inum = inode_u.bi_inum;
ret = bch2_subvolume_get(trans, inum.subvol, true, ret = bch2_subvolume_get(trans, inum.subvol, true,
@ -396,7 +434,7 @@ err_before_quota:
* bch2_trans_exit() and dropping locks, else we could race with another * bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it: * thread pulling the inode in and modifying it:
*/ */
inode = bch2_inode_insert(c, inode); inode = bch2_inode_hash_insert(c, inode);
bch2_trans_put(trans); bch2_trans_put(trans);
err: err:
posix_acl_release(default_acl); posix_acl_release(default_acl);
@ -436,11 +474,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (ret) if (ret)
goto err; goto err;
struct bch_inode_info *inode = struct bch_inode_info *inode = bch2_inode_hash_find(c, inum);
to_bch_ei(ilookup5_nowait(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
if (inode) if (inode)
goto out; goto out;
@ -470,7 +504,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
} }
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode); inode = bch2_inode_hash_insert(c, inode);
out: out:
bch2_trans_iter_exit(trans, &dirent_iter); bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf); printbuf_exit(&buf);
@ -557,8 +591,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem); lockdep_assert_held(&inode->v.i_rwsem);
ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
bch2_subvol_is_ro(c, inode->ei_subvol) ?: bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
__bch2_link(c, inode, dir, dentry); __bch2_link(c, inode, dir, dentry);
if (unlikely(ret)) if (unlikely(ret))
return bch2_err_class(ret); return bch2_err_class(ret);
@ -614,7 +648,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info; struct bch_fs *c = dir->v.i_sb->s_fs_info;
int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
__bch2_unlink(vdir, dentry, false); __bch2_unlink(vdir, dentry, false);
return bch2_err_class(ret); return bch2_err_class(ret);
} }
@ -697,8 +731,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
trans = bch2_trans_get(c); trans = bch2_trans_get(c);
ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret) if (ret)
goto err; goto err;
@ -899,7 +933,7 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->blksize = block_bytes(c); stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks; stat->blocks = inode->v.i_blocks;
stat->subvol = inode->ei_subvol; stat->subvol = inode->ei_inum.subvol;
stat->result_mask |= STATX_SUBVOL; stat->result_mask |= STATX_SUBVOL;
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
@ -941,7 +975,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
lockdep_assert_held(&inode->v.i_rwsem); lockdep_assert_held(&inode->v.i_rwsem);
ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
setattr_prepare(idmap, dentry, iattr); setattr_prepare(idmap, dentry, iattr);
if (ret) if (ret)
return ret; return ret;
@ -1053,7 +1087,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
retry: retry:
bch2_trans_begin(trans); bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
if (ret) if (ret)
goto err; goto err;
@ -1173,7 +1207,7 @@ static int bch2_open(struct inode *vinode, struct file *file)
struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret = bch2_subvol_is_ro(c, inode->ei_subvol); int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
if (ret) if (ret)
return ret; return ret;
} }
@ -1305,8 +1339,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type)
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{ {
return (struct bcachefs_fid) { return (struct bcachefs_fid) {
.inum = inode->ei_inode.bi_inum, .inum = inode->ei_inum.inum,
.subvol = inode->ei_subvol, .subvol = inode->ei_inum.subvol,
.gen = inode->ei_inode.bi_generation, .gen = inode->ei_inode.bi_generation,
}; };
} }
@ -1391,7 +1425,7 @@ static struct dentry *bch2_get_parent(struct dentry *child)
struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_fs *c = inode->v.i_sb->s_fs_info;
subvol_inum parent_inum = { subvol_inum parent_inum = {
.subvol = inode->ei_inode.bi_parent_subvol ?: .subvol = inode->ei_inode.bi_parent_subvol ?:
inode->ei_subvol, inode->ei_inum.subvol,
.inum = inode->ei_inode.bi_dir, .inum = inode->ei_inode.bi_dir,
}; };
@ -1427,7 +1461,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
retry: retry:
bch2_trans_begin(trans); bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
if (ret) if (ret)
goto err; goto err;
@ -1458,8 +1492,7 @@ retry:
if (ret) if (ret)
goto err; goto err;
if (target.subvol == inode->ei_subvol && if (subvol_inum_eq(target, inode->ei_inum))
target.inum == inode->ei_inode.bi_inum)
goto found; goto found;
} else { } else {
/* /*
@ -1480,8 +1513,7 @@ retry:
if (ret) if (ret)
continue; continue;
if (target.subvol == inode->ei_subvol && if (subvol_inum_eq(target, inode->ei_inum))
target.inum == inode->ei_inode.bi_inum)
goto found; goto found;
} }
} }
@ -1518,7 +1550,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *bi, struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol) struct bch_subvolume *subvol)
{ {
bch2_iget5_set(&inode->v, &inum); inode->v.i_ino = inum.inum;
inode->ei_inum = inum;
inode->ei_inode.bi_inum = inum.inum;
bch2_inode_update_after_write(trans, inode, bi, ~0); bch2_inode_update_after_write(trans, inode, bi, ~0);
inode->v.i_blocks = bi->bi_sectors; inode->v.i_blocks = bi->bi_sectors;
@ -1530,7 +1564,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_flags = 0; inode->ei_flags = 0;
inode->ei_quota_reserved = 0; inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi); inode->ei_qid = bch_qid(bi);
inode->ei_subvol = inum.subvol;
if (BCH_SUBVOLUME_SNAP(subvol)) if (BCH_SUBVOLUME_SNAP(subvol))
set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
@ -1597,6 +1630,17 @@ static void bch2_evict_inode(struct inode *vinode)
{ {
struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_inode_info *inode = to_bch_ei(vinode);
bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);
/*
* evict() has waited for outstanding writeback, we'll do no more IO
* through this inode: it's safe to remove from VFS inode hashtable here
*
* Do that now so that other threads aren't blocked from pulling it back
* in, there's no reason for them to be:
*/
if (!delete)
bch2_inode_hash_remove(c, inode);
truncate_inode_pages_final(&inode->v.i_data); truncate_inode_pages_final(&inode->v.i_data);
@ -1604,12 +1648,18 @@ static void bch2_evict_inode(struct inode *vinode)
BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { if (delete) {
bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
KEY_TYPE_QUOTA_WARN); KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN); KEY_TYPE_QUOTA_WARN);
bch2_inode_rm(c, inode_inum(inode)); bch2_inode_rm(c, inode_inum(inode));
/*
* If we are deleting, we need it present in the vfs hash table
* so that fsck can check if unlinked inodes are still open:
*/
bch2_inode_hash_remove(c, inode);
} }
mutex_lock(&c->vfs_inodes_lock); mutex_lock(&c->vfs_inodes_lock);
@ -1639,7 +1689,7 @@ again:
mutex_lock(&c->vfs_inodes_lock); mutex_lock(&c->vfs_inodes_lock);
list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
if (!snapshot_list_has_id(s, inode->ei_subvol)) if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
continue; continue;
if (!(inode->v.i_state & I_DONTCACHE) && if (!(inode->v.i_state & I_DONTCACHE) &&
@ -2127,6 +2177,17 @@ static int bch2_init_fs_context(struct fs_context *fc)
return 0; return 0;
} }
void bch2_fs_vfs_exit(struct bch_fs *c)
{
if (c->vfs_inodes_table.tbl)
rhashtable_destroy(&c->vfs_inodes_table);
}
int bch2_fs_vfs_init(struct bch_fs *c)
{
return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
}
static struct file_system_type bcache_fs_type = { static struct file_system_type bcache_fs_type = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
.name = "bcachefs", .name = "bcachefs",

View File

@ -13,6 +13,9 @@
struct bch_inode_info { struct bch_inode_info {
struct inode v; struct inode v;
struct rhash_head hash;
subvol_inum ei_inum;
struct list_head ei_vfs_inode_list; struct list_head ei_vfs_inode_list;
unsigned long ei_flags; unsigned long ei_flags;
@ -24,8 +27,6 @@ struct bch_inode_info {
struct mutex ei_quota_lock; struct mutex ei_quota_lock;
struct bch_qid ei_qid; struct bch_qid ei_qid;
u32 ei_subvol;
/* /*
* When we've been doing nocow writes we'll need to issue flushes to the * When we've been doing nocow writes we'll need to issue flushes to the
* underlying block devices * underlying block devices
@ -50,10 +51,7 @@ struct bch_inode_info {
static inline subvol_inum inode_inum(struct bch_inode_info *inode) static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{ {
return (subvol_inum) { return inode->ei_inum;
.subvol = inode->ei_subvol,
.inum = inode->ei_inode.bi_inum,
};
} }
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
@ -69,6 +67,7 @@ struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
* those: * those:
*/ */
#define EI_INODE_SNAPSHOT 1 #define EI_INODE_SNAPSHOT 1
#define EI_INODE_HASHED 2
#define to_bch_ei(_inode) \ #define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v) container_of_or_null(_inode, struct bch_inode_info, v)
@ -189,6 +188,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool);
void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
void bch2_fs_vfs_exit(struct bch_fs *);
int bch2_fs_vfs_init(struct bch_fs *);
void bch2_vfs_exit(void); void bch2_vfs_exit(void);
int bch2_vfs_init(void); int bch2_vfs_init(void);
@ -203,6 +205,10 @@ static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, su
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {} snapshot_id_list *s) {}
static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
static inline void bch2_vfs_exit(void) {} static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; } static inline int bch2_vfs_init(void) { return 0; }

View File

@ -365,7 +365,7 @@ int bch2_inode_peek(struct btree_trans *trans,
subvol_inum inum, unsigned flags) subvol_inum inum, unsigned flags)
{ {
int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
return ret; return ret;
} }

View File

@ -30,7 +30,8 @@ struct snapshot_table {
}; };
typedef struct { typedef struct {
u32 subvol; /* we can't have padding in this struct: */
u64 subvol;
u64 inum; u64 inum;
} subvol_inum; } subvol_inum;

View File

@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_direct_exit(c);
bch2_fs_fs_io_buffered_exit(c); bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c); bch2_fs_fsio_exit(c);
bch2_fs_vfs_exit(c);
bch2_fs_ec_exit(c); bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c); bch2_fs_encryption_exit(c);
bch2_fs_nocow_locking_exit(c); bch2_fs_nocow_locking_exit(c);
@ -926,6 +927,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_encryption_init(c) ?: bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?: bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?: bch2_fs_ec_init(c) ?:
bch2_fs_vfs_init(c) ?:
bch2_fs_fsio_init(c) ?: bch2_fs_fsio_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_fs_io_direct_init(c); bch2_fs_fs_io_direct_init(c);

View File

@ -306,7 +306,7 @@ retry:
bch2_trans_begin(trans); bch2_trans_begin(trans);
iter = (struct btree_iter) { NULL }; iter = (struct btree_iter) { NULL };
ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
if (ret) if (ret)
goto err; goto err;