// SPDX-License-Identifier: GPL-2.0 #ifndef NO_BCACHEFS_FS #include "bcachefs.h" #include "acl.h" #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" #include "chardev.h" #include "dirent.h" #include "errcode.h" #include "extents.h" #include "fs.h" #include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fs-io-buffered.h" #include "fs-io-direct.h" #include "fs-io-pagecache.h" #include "fsck.h" #include "inode.h" #include "io_read.h" #include "journal.h" #include "keylist.h" #include "quota.h" #include "snapshot.h" #include "super.h" #include "xattr.h" #include "trace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include static struct kmem_cache *bch2_inode_cache; static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *, struct bch_subvolume *); void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, unsigned fields) { struct bch_fs *c = trans->c; BUG_ON(bi->bi_inum != inode->v.i_ino); bch2_assert_pos_locked(trans, BTREE_ID_inodes, POS(0, bi->bi_inum)); set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); inode->v.i_mode = bi->bi_mode; if (fields & ATTR_ATIME) inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); if (fields & ATTR_MTIME) inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); if (fields & ATTR_CTIME) inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); inode->ei_inode = *bi; bch2_inode_flags_to_vfs(inode); } int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_info *inode, inode_set_fn set, void *p, unsigned fields) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; retry: bch2_trans_begin(trans); ret = 
bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent) ?: (set ? set(trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; * this is important for inode updates via bchfs_write_index_update */ if (!ret) bch2_inode_update_after_write(trans, inode, &inode_u, fields); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), inode_inum(inode).subvol, inode_inum(inode).inum); bch2_trans_put(trans); return ret < 0 ? ret : 0; } int bch2_fs_quota_transfer(struct bch_fs *c, struct bch_inode_info *inode, struct bch_qid new_qid, unsigned qtypes, enum quota_acct_mode mode) { unsigned i; int ret; qtypes &= enabled_qtypes(c); for (i = 0; i < QTYP_NR; i++) if (new_qid.q[i] == inode->ei_qid.q[i]) qtypes &= ~(1U << i); if (!qtypes) return 0; mutex_lock(&inode->ei_quota_lock); ret = bch2_quota_transfer(c, qtypes, new_qid, inode->ei_qid, inode->v.i_blocks + inode->ei_quota_reserved, mode); if (!ret) for (i = 0; i < QTYP_NR; i++) if (qtypes & (1 << i)) inode->ei_qid.q[i] = new_qid.q[i]; mutex_unlock(&inode->ei_quota_lock); return ret; } static bool subvol_inum_eq(subvol_inum a, subvol_inum b) { return a.subvol == b.subvol && a.inum == b.inum; } static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; return jhash(&inum->inum, sizeof(inum->inum), seed); } static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) { const struct bch_inode_info *inode = data; return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed); } static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, const void *obj) { const struct bch_inode_info *inode = obj; const 
subvol_inum *v = arg->key;

	/* rhashtable compare callbacks return 0 on match, nonzero otherwise */
	return !subvol_inum_eq(inode->ei_inum, *v);
}

/*
 * Parameters of the VFS inode hash table: objects are struct bch_inode_info,
 * keyed by ei_inum, with the custom hash/compare callbacks defined above.
 */
static const struct rhashtable_params bch2_vfs_inodes_params = {
	.head_offset		= offsetof(struct bch_inode_info, hash),
	.key_offset		= offsetof(struct bch_inode_info, ei_inum),
	.key_len		= sizeof(subvol_inum),
	.hashfn			= bch2_vfs_inode_hash_fn,
	.obj_hashfn		= bch2_vfs_inode_obj_hash_fn,
	.obj_cmpfn		= bch2_vfs_inode_cmp_fn,
	.automatic_shrinking	= true,
};

/*
 * Check whether inode number @p.offset is present in the VFS inode table in
 * any subvolume whose snapshot makes bch2_snapshot_is_ancestor(c, snap,
 * @p.snapshot) true.
 *
 * Returns nonzero if such an inode is found, 0 if not (or if the filesystem
 * has not been marked started), or a negative error code.
 */
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
{
	struct bch_fs *c = trans->c;
	struct rhashtable *ht = &c->vfs_inodes_table;
	subvol_inum inum = (subvol_inum) { .inum = p.offset };
	DARRAY(u32) subvols;
	int ret = 0;

	if (!test_bit(BCH_FS_started, &c->flags))
		return false;

	darray_init(&subvols);
restart_from_top:

	/*
	 * Tweaked version of __rhashtable_lookup(); we need to get a list of
	 * subvolumes in which the given inode number is open.
	 *
	 * For this to work, we don't include the subvolume ID in the key that
	 * we hash - all inodes with the same inode number regardless of
	 * subvolume will hash to the same slot.
	 *
	 * This will be less than ideal if the same file is ever open
	 * simultaneously in many different snapshots:
	 */
	rcu_read_lock();
	struct rhash_lock_head __rcu *const *bkt;
	struct rhash_head *he;
	unsigned int hash;
	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
	hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
	bkt = rht_bucket(tbl, hash);
	do {
		struct bch_inode_info *inode;

		rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
			if (inode->ei_inum.inum == inum.inum) {
				/* non-blocking push: we are under rcu_read_lock() */
				ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
						      GFP_NOWAIT|__GFP_NOWARN);
				if (ret) {
					/*
					 * Push failed: leave the RCU section,
					 * make room there, then discard what
					 * was collected and rescan.
					 */
					rcu_read_unlock();
					ret = darray_make_room(&subvols, 1);
					if (ret)
						goto err;
					subvols.nr = 0;
					goto restart_from_top;
				}
			}
		}
		/* An object might have been moved to a different hash chain,
		 * while we walk along it - better check and retry.
		 */
	} while (he != RHT_NULLS_MARKER(bkt));

	/* Ensure we see any new tables. */
	smp_rmb();

	tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	if (unlikely(tbl))
		goto restart;
	rcu_read_unlock();

	/* Resolve each collected subvolume and test its snapshot against p. */
	darray_for_each(subvols, i) {
		u32 snap;
		ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
		if (ret)
			goto err;

		ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
		if (ret)
			break;
	}
err:
	darray_exit(&subvols);
	return ret;
}

/* Plain hash table lookup by (subvolume, inode number); takes no reference. */
static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}

/*
 * Wait for an inode that is being freed to leave the hash table.
 *
 * Called with inode->v.i_lock held; the lock is dropped here.  Sleeps for up
 * to HZ * 10 jiffies, and only if @inode is still what a lookup of @inum
 * finds after the wait has been prepared.
 */
static void __wait_on_freeing_inode(struct bch_fs *c,
				    struct bch_inode_info *inode,
				    subvol_inum inum)
{
	wait_queue_head_t *wq;
	struct wait_bit_queue_entry wait;

	wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->v.i_lock);

	if (__bch2_inode_hash_find(c, inum) == inode)
		schedule_timeout(HZ * 10);
	finish_wait(wq, &wait.wq_entry);
}

/*
 * Look up @inum in the VFS inode table and take a reference on the result.
 *
 * @trans may be NULL.  If the inode found is being freed we wait for it to
 * go away and retry; with a transaction, its btree locks are dropped around
 * the wait and retaken afterwards.
 *
 * Returns the inode with an extra reference, NULL if there is no hashed
 * inode for @inum, or an ERR_PTR if relocking the transaction failed.
 */
static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
						   subvol_inum inum)
{
	struct bch_inode_info *inode;
repeat:
	inode = __bch2_inode_hash_find(c, inum);
	if (inode) {
		spin_lock(&inode->v.i_lock);
		if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
			spin_unlock(&inode->v.i_lock);
			return NULL;
		}
		if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
			/* __wait_on_freeing_inode() drops i_lock for us */
			if (!trans) {
				__wait_on_freeing_inode(c, inode, inum);
			} else {
				bch2_trans_unlock(trans);
				__wait_on_freeing_inode(c, inode, inum);
				int ret = bch2_trans_relock(trans);
				if (ret)
					return ERR_PTR(ret);
			}
			goto repeat;
		}
		__iget(&inode->v);
		spin_unlock(&inode->v.i_lock);
	}

	return inode;
}

/*
 * Remove @inode from the VFS inode table if it is still marked hashed, then
 * wake anyone waiting on it.  The hashed flag is tested and cleared under
 * i_lock so removal happens at most once.
 */
static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
	spin_lock(&inode->v.i_lock);
	bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
	spin_unlock(&inode->v.i_lock);

	if (remove) {
		int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
					&inode->hash, bch2_vfs_inodes_params);
		BUG_ON(ret);
		inode->v.i_hash.pprev = NULL;
		/*
		 * This pairs with the
bch2_inode_hash_find() ->
		 * __wait_on_freeing_inode() path
		 */
		inode_wake_up_bit(&inode->v, __I_NEW);
	}
}

/*
 * Insert a fully initialized @inode into the VFS inode table.
 *
 * If another inode with the same ei_inum is already hashed, @inode is
 * discarded and the existing one is returned instead, referenced via
 * bch2_inode_hash_find().  If that lookup finds nothing, the insert is
 * retried.  On successful insertion the inode is also added to the
 * superblock's inode list and to c->vfs_inodes_list.
 *
 * Returns the inode the caller should use.  The value may also be whatever
 * bch2_inode_hash_find() returned, including an ERR_PTR.
 */
static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
						     struct btree_trans *trans,
						     struct bch_inode_info *inode)
{
	struct bch_inode_info *old = inode;

	set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
	if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
						  &inode->ei_inum,
						  &inode->hash,
						  bch2_vfs_inodes_params))) {
		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
		if (!old)
			goto retry;

		clear_bit(EI_INODE_HASHED, &inode->ei_flags);

		/*
		 * bcachefs doesn't use I_NEW; we have no use for it since we
		 * only insert fully created inodes in the inode hash table. But
		 * discard_new_inode() expects it to be set...
		 */
		inode->v.i_state |= I_NEW;
		/*
		 * We don't want bch2_evict_inode() to delete the inode on disk,
		 * we just raced and had another inode in cache. Normally new
		 * inodes don't have nlink == 0 - except tmpfiles do...
		 */
		set_nlink(&inode->v, 1);
		discard_new_inode(&inode->v);
		return old;
	} else {
		inode_fake_hash(&inode->v);

		inode_sb_list_add(&inode->v);

		mutex_lock(&c->vfs_inodes_lock);
		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
		mutex_unlock(&c->vfs_inodes_lock);
		return inode;
	}
}

/*
 * Evaluate @_do with the memalloc flags @_flags applied for its duration and
 * yield its value.  The saved flags are restored afterwards.
 */
#define memalloc_flags_do(_flags, _do)						\
({										\
	unsigned _saved_flags = memalloc_flags_save(_flags);			\
	typeof(_do) _ret = _do;							\
	memalloc_noreclaim_restore(_saved_flags);				\
	_ret;									\
})

/*
 * Must never be called: inodes in this file are allocated through
 * __bch2_new_inode() instead.
 */
static struct inode *bch2_alloc_inode(struct super_block *sb)
{
	BUG();
}

/*
 * Allocate a struct bch_inode_info from bch2_inode_cache with @gfp and
 * initialize the embedded VFS inode, its locks, list head and flags.
 *
 * Returns NULL on allocation or initialization failure.
 */
static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
	struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
						bch2_inode_cache, gfp);
	if (!inode)
		return NULL;

	inode_init_once(&inode->v);
	mutex_init(&inode->ei_update_lock);
	two_state_lock_init(&inode->ei_pagecache_lock);
	INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
	inode->ei_flags = 0;
	mutex_init(&inode->ei_quota_lock);
	memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));

	if
(unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) { kmem_cache_free(bch2_inode_cache, inode); return NULL; } return inode; } /* * Allocate a new inode, dropping/retaking btree locks if necessary: */ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) { struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT); if (unlikely(!inode)) { int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM); if (ret && inode) { __destroy_inode(&inode->v); kmem_cache_free(bch2_inode_cache, inode); } if (ret) return ERR_PTR(ret); } return inode; } static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { struct bch_inode_info *inode = bch2_new_inode(trans); if (IS_ERR(inode)) return inode; bch2_vfs_inode_init(trans, inum, inode, bi, subvol); return bch2_inode_hash_insert(trans->c, trans, inode); } struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); if (inode) return &inode->v; struct btree_trans *trans = bch2_trans_get(c); struct bch_inode_unpacked inode_u; struct bch_subvolume subvol; int ret = lockrestart_do(trans, bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); return ret ? 
ERR_PTR(ret) : &inode->v;
}

/*
 * Create a new inode in directory @dir and return its VFS inode.
 *
 * @idmap:        mount idmapping used to derive the owning uid/gid
 * @dir:          parent directory
 * @dentry:       name to create; not passed down when BCH_CREATE_TMPFILE
 * @mode, @rdev:  mode and device number of the new inode
 * @snapshot_src: passed through to bch2_create_trans()
 * @flags:        BCH_CREATE_* flags; with BCH_CREATE_TMPFILE the directory's
 *                ei_update_lock is not taken and the directory's in-memory
 *                inode is not refreshed
 *
 * ACLs and the VFS inode are allocated before the btree transaction; the
 * transaction is retried on restart.  One inode of quota is charged inside
 * the transaction and given back if the later steps fail.
 *
 * Returns the inode (as returned by bch2_inode_hash_insert()) or an ERR_PTR.
 */
struct bch_inode_info *
__bch2_create(struct mnt_idmap *idmap,
	      struct bch_inode_info *dir, struct dentry *dentry,
	      umode_t mode, dev_t rdev, subvol_inum snapshot_src,
	      unsigned flags)
{
	struct bch_fs *c = dir->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct bch_inode_unpacked dir_u;
	struct bch_inode_info *inode;
	struct bch_inode_unpacked inode_u;
	struct posix_acl *default_acl = NULL, *acl = NULL;
	subvol_inum inum;
	struct bch_subvolume subvol;
	u64 journal_seq = 0;
	kuid_t kuid;
	kgid_t kgid;
	int ret;

	/*
	 * preallocate acls + vfs inode before btree transaction, so that
	 * nothing can fail after the transaction succeeds:
	 */
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
	if (ret)
		return ERR_PTR(ret);
#endif
	inode = __bch2_new_inode(c, GFP_NOFS);
	if (unlikely(!inode)) {
		inode = ERR_PTR(-ENOMEM);
		goto err;
	}

	bch2_inode_init_early(c, &inode_u);

	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_lock(&dir->ei_update_lock);

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
	kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
	ret   = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
		bch2_create_trans(trans,
				  inode_inum(dir), &dir_u, &inode_u,
				  !(flags & BCH_CREATE_TMPFILE)
				  ? &dentry->d_name : NULL,
				  from_kuid(i_user_ns(&dir->v), kuid),
				  from_kgid(i_user_ns(&dir->v), kgid),
				  mode, rdev,
				  default_acl, acl, snapshot_src, flags) ?:
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
				KEY_TYPE_QUOTA_PREALLOC);
	if (unlikely(ret))
		goto err_before_quota;

	/* bi_subvol == 0 means: same subvolume as the directory */
	inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
	inum.inum = inode_u.bi_inum;

	ret   = bch2_subvolume_get(trans, inum.subvol, true,
				   BTREE_ITER_with_updates, &subvol) ?:
		bch2_trans_commit(trans, NULL, &journal_seq, 0);
	if (unlikely(ret)) {
		/* undo the quota charge made above */
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
err_before_quota:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err_trans;
	}

	if (!(flags & BCH_CREATE_TMPFILE)) {
		bch2_inode_update_after_write(trans, dir, &dir_u,
					      ATTR_MTIME|ATTR_CTIME);
		mutex_unlock(&dir->ei_update_lock);
	}

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);

	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
	set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);

	/*
	 * we must insert the new inode into the inode cache before calling
	 * bch2_trans_exit() and dropping locks, else we could race with another
	 * thread pulling the inode in and modifying it:
	 *
	 * also, calling bch2_inode_hash_insert() without passing in the
	 * transaction object is sketchy - if we could ever end up in
	 * __wait_on_freeing_inode(), we'd risk deadlock.
	 *
	 * But that shouldn't be possible, since we still have the inode locked
	 * that we just created, and we _really_ can't take a transaction
	 * restart here.
	 */
	inode = bch2_inode_hash_insert(c, NULL, inode);
	bch2_trans_put(trans);
err:
	/* common exit: drop our references to the ACLs (may be NULL) */
	posix_acl_release(default_acl);
	posix_acl_release(acl);
	return inode;
err_trans:
	if (!(flags & BCH_CREATE_TMPFILE))
		mutex_unlock(&dir->ei_update_lock);

	bch2_trans_put(trans);
	make_bad_inode(&inode->v);
	iput(&inode->v);
	inode = ERR_PTR(ret);
	goto err;
}

/* methods */

/*
 * Look up @name in directory @dir and return the VFS inode it refers to.
 *
 * The dirent is found via the directory hash and its target resolved.  The
 * inode is taken from the VFS inode table if hashed there, otherwise it is
 * read from the btree and inserted.  For freshly read inodes the dirent is
 * additionally checked against the inode's recorded bi_dir/bi_dir_offset
 * when bch2_inode_should_have_bp() says so.
 *
 * Returns the inode or an ERR_PTR.
 */
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
			subvol_inum dir, struct bch_hash_info *dir_hash_info,
			const struct qstr *name)
{
	struct bch_fs *c = trans->c;
	struct btree_iter dirent_iter = {};
	subvol_inum inum = {};
	struct printbuf buf = PRINTBUF;

	struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
					     dir_hash_info, dir, name, 0);
	int ret = bkey_err(k);
	if (ret)
		return ERR_PTR(ret);

	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
	/* a positive return is treated as "no such entry" */
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto err;

	struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
	if (inode)
		goto out;

	struct bch_subvolume subvol;
	struct bch_inode_unpacked inode_u;
	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));

	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
				c, "dirent to missing inode:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
	if (ret)
		goto err;

	/* regular files may have hardlinks: */
	if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) &&
				    !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
				    c,
				    "dirent points to inode that does not point back:\n %s",
				    (bch2_bkey_val_to_text(&buf, c, k),
				     prt_printf(&buf, "\n "),
				     bch2_inode_unpacked_to_text(&buf, &inode_u),
				     buf.buf))) {
		/*
		 * NOTE(review): inode was just returned by
		 * bch2_inode_hash_init_insert(); the err path below overwrites
		 * the pointer without an iput() - confirm the reference is not
		 * leaked here.
		 */
		ret = -ENOENT;
		goto err;
	}
out:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
	return inode;
err:
	inode = ERR_PTR(ret);
	goto out;
}

static struct dentry *bch2_lookup(struct inode *vdir, struct dentry
*dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct bch_inode_info *inode; bch2_trans_do(c, PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), &hash, &dentry->d_name))); if (IS_ERR(inode)) inode = NULL; return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); d_instantiate(dentry, &inode->v); return 0; } static int bch2_create(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode, bool excl) { return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); } static int __bch2_link(struct bch_fs *c, struct bch_inode_info *inode, struct bch_inode_info *dir, struct dentry *dentry) { struct bch_inode_unpacked dir_u, inode_u; int ret; mutex_lock(&inode->ei_update_lock); struct btree_trans *trans = bch2_trans_get(c); ret = commit_do(trans, NULL, NULL, 0, bch2_link_trans(trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); } bch2_trans_put(trans); mutex_unlock(&inode->ei_update_lock); return ret; } static int bch2_link(struct dentry *old_dentry, struct inode *vdir, struct dentry *dentry) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); int ret; lockdep_assert_held(&inode->v.i_rwsem); ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return 
bch2_err_class(ret); ihold(&inode->v); d_instantiate(dentry, &inode->v); return 0; } int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bool deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_inode_unpacked dir_u, inode_u; int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); struct btree_trans *trans = bch2_trans_get(c); ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_unlink_trans(trans, inode_inum(dir), &dir_u, &inode_u, &dentry->d_name, deleting_snapshot)); if (unlikely(ret)) goto err; bch2_inode_update_after_write(trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_MTIME); if (inode_u.bi_subvol) { /* * Subvolume deletion is asynchronous, but we still want to tell * the VFS that it's been deleted here: */ set_nlink(&inode->v, 0); } err: bch2_trans_put(trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); return ret; } static int bch2_unlink(struct inode *vdir, struct dentry *dentry) { struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: __bch2_unlink(vdir, dentry, false); return bch2_err_class(ret); } static int bch2_symlink(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, const char *symname) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); inode_lock(&inode->v); ret = page_symlink(&inode->v, symname, strlen(symname) + 1); inode_unlock(&inode->v); if (unlikely(ret)) goto err; ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); if (unlikely(ret)) goto err; ret = __bch2_link(c, inode, dir, dentry); if 
(unlikely(ret))
		goto err;

	d_instantiate(dentry, &inode->v);
	return 0;
err:
	/* drop the reference obtained from __bch2_create() */
	iput(&inode->v);
	return bch2_err_class(ret);
}

/* ->mkdir: a directory is just mknod with S_IFDIR and no device. */
static int bch2_mkdir(struct mnt_idmap *idmap,
		      struct inode *vdir, struct dentry *dentry, umode_t mode)
{
	return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
}

/*
 * ->rename: move @src_dentry in @src_vdir to @dst_dentry in @dst_vdir.
 *
 * @flags may contain RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT;
 * anything else yields -EINVAL.  The rename mode is EXCHANGE if requested,
 * OVERWRITE if the destination dentry has an inode, plain RENAME otherwise.
 *
 * All four inodes are locked with INODE_UPDATE_LOCK.  Project quota is
 * transferred up front where the project attribute would change, and is
 * re-synchronized with each inode's committed state on the way out, so a
 * failed rename undoes the transfer.  The btree transaction is retried on
 * restart.
 *
 * Returns 0 or a standard error code.
 */
static int bch2_rename2(struct mnt_idmap *idmap,
			struct inode *src_vdir, struct dentry *src_dentry,
			struct inode *dst_vdir, struct dentry *dst_dentry,
			unsigned flags)
{
	struct bch_fs *c = src_vdir->i_sb->s_fs_info;
	struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
	struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
	struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
	struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
	struct bch_inode_unpacked dst_dir_u, src_dir_u;
	struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u;
	struct btree_trans *trans;
	enum bch_rename_mode mode = flags & RENAME_EXCHANGE
		? BCH_RENAME_EXCHANGE
		: dst_dentry->d_inode
		? BCH_RENAME_OVERWRITE : BCH_RENAME;
	bool whiteout = !!(flags & RENAME_WHITEOUT);
	int ret;

	if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT))
		return -EINVAL;

	/* when overwriting, write back the source's dirty pages first */
	if (mode == BCH_RENAME_OVERWRITE) {
		ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
						   0, LLONG_MAX);
		if (ret)
			return ret;
	}

	bch2_lock_inodes(INODE_UPDATE_LOCK,
			 src_dir,
			 dst_dir,
			 src_inode,
			 dst_inode);

	trans = bch2_trans_get(c);

	ret   = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
		bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
	if (ret)
		goto err_tx_restart;

	if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, src_inode,
					     dst_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}

	if (mode == BCH_RENAME_EXCHANGE &&
	    inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
		ret = bch2_fs_quota_transfer(c, dst_inode,
					     src_dir->ei_qid,
					     1 << QTYP_PRJ,
					     KEY_TYPE_QUOTA_PREALLOC);
		if (ret)
			goto err;
	}
retry:
	bch2_trans_begin(trans);

	ret = bch2_rename_trans(trans,
				inode_inum(src_dir), &src_dir_u,
				inode_inum(dst_dir), &dst_dir_u,
				&src_inode_u,
				&dst_inode_u,
				&src_dentry->d_name,
				&dst_dentry->d_name,
				mode);
	if (unlikely(ret))
		goto err_tx_restart;

	if (whiteout) {
		/* create a whiteout character device under the old name */
		whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u));
		ret = PTR_ERR_OR_ZERO(whiteout_inode_u);
		if (unlikely(ret))
			goto err_tx_restart;
		bch2_inode_init_early(c, whiteout_inode_u);

		ret = bch2_create_trans(trans,
					inode_inum(src_dir), &src_dir_u,
					whiteout_inode_u,
					&src_dentry->d_name,
					from_kuid(i_user_ns(&src_dir->v), current_fsuid()),
					from_kgid(i_user_ns(&src_dir->v), current_fsgid()),
					S_IFCHR|WHITEOUT_MODE, 0,
					NULL, NULL, (subvol_inum) { 0 }, 0) ?:
		      bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1,
				      KEY_TYPE_QUOTA_PREALLOC);
		if (unlikely(ret))
			goto err_tx_restart;
	}

	ret = bch2_trans_commit(trans, NULL, NULL, 0);
	if (unlikely(ret)) {
err_tx_restart:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;
		goto err;
	}

	BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
	BUG_ON(dst_inode &&
	       dst_inode->v.i_ino != dst_inode_u.bi_inum);

	bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
				      ATTR_MTIME|ATTR_CTIME);

	if (src_dir != dst_dir)
		bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
					      ATTR_MTIME|ATTR_CTIME);

	bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
				      ATTR_CTIME);

	if (dst_inode)
		bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
					      ATTR_CTIME);
err:
	bch2_trans_put(trans);

	/*
	 * Bring the project quota in line with what ei_inode now says; this
	 * runs on both the success and the failure path.
	 */
	bch2_fs_quota_transfer(c, src_inode,
			       bch_qid(&src_inode->ei_inode),
			       1 << QTYP_PRJ,
			       KEY_TYPE_QUOTA_NOCHECK);
	if (dst_inode)
		bch2_fs_quota_transfer(c, dst_inode,
				       bch_qid(&dst_inode->ei_inode),
				       1 << QTYP_PRJ,
				       KEY_TYPE_QUOTA_NOCHECK);

	bch2_unlock_inodes(INODE_UPDATE_LOCK,
			   src_dir,
			   dst_dir,
			   src_inode,
			   dst_inode);

	return bch2_err_class(ret);
}

/*
 * Apply the fields of @attr selected by attr->ia_valid to the unpacked
 * inode @bi.
 */
static void bch2_setattr_copy(struct mnt_idmap *idmap,
			      struct bch_inode_info *inode,
			      struct bch_inode_unpacked *bi,
			      struct iattr *attr)
{
	struct bch_fs *c =
inode->v.i_sb->s_fs_info; unsigned int ia_valid = attr->ia_valid; kuid_t kuid; kgid_t kgid; if (ia_valid & ATTR_UID) { kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); } if (ia_valid & ATTR_GID) { kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); } if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; if (ia_valid & ATTR_ATIME) bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); if (ia_valid & ATTR_MTIME) bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); if (ia_valid & ATTR_CTIME) bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; kgid_t gid = ia_valid & ATTR_GID ? kgid : inode->v.i_gid; if (!in_group_or_capable(idmap, &inode->v, make_vfsgid(idmap, i_user_ns(&inode->v), gid))) mode &= ~S_ISGID; bi->bi_mode = mode; } } int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans *trans; struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; kuid_t kuid; kgid_t kgid; int ret; mutex_lock(&inode->ei_update_lock); qid = inode->ei_qid; if (attr->ia_valid & ATTR_UID) { kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); } if (attr->ia_valid & ATTR_GID) { kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); } ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); kfree(acl); acl = NULL; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); if (ret) goto btree_err; bch2_setattr_copy(idmap, inode, &inode_u, attr); if 
(attr->ia_valid & ATTR_MODE) { ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, inode_u.bi_mode, &acl); if (ret) goto btree_err; } ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); btree_err: bch2_trans_iter_exit(trans, &inode_iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; if (unlikely(ret)) goto err_trans; bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); err_trans: bch2_trans_put(trans); err: mutex_unlock(&inode->ei_update_lock); return bch2_err_class(ret); } static int bch2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned query_flags) { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); stat->dev = inode->v.i_sb->s_dev; stat->ino = inode->v.i_ino; stat->mode = inode->v.i_mode; stat->nlink = inode->v.i_nlink; stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); stat->atime = inode_get_atime(&inode->v); stat->mtime = inode_get_mtime(&inode->v); stat->ctime = inode_get_ctime(&inode->v); stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; stat->subvol = inode->ei_inum.subvol; stat->result_mask |= STATX_SUBVOL; if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { stat->result_mask |= STATX_DIOALIGN; /* * this is incorrect; we should be tracking this in superblock, * and checking the alignment of open devices */ stat->dio_mem_align = SECTOR_SIZE; stat->dio_offset_align = block_bytes(c); } if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); } if 
(inode->ei_inode.bi_flags & BCH_INODE_immutable) stat->attributes |= STATX_ATTR_IMMUTABLE; stat->attributes_mask |= STATX_ATTR_IMMUTABLE; if (inode->ei_inode.bi_flags & BCH_INODE_append) stat->attributes |= STATX_ATTR_APPEND; stat->attributes_mask |= STATX_ATTR_APPEND; if (inode->ei_inode.bi_flags & BCH_INODE_nodump) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= STATX_ATTR_NODUMP; return 0; } static int bch2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; int ret; lockdep_assert_held(&inode->v.i_rwsem); ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: setattr_prepare(idmap, dentry, iattr); if (ret) return ret; return iattr->ia_valid & ATTR_SIZE ? bchfs_truncate(idmap, inode, iattr) : bch2_setattr_nonsize(idmap, inode, iattr); } static int bch2_tmpfile(struct mnt_idmap *idmap, struct inode *vdir, struct file *file, umode_t mode) { struct bch_inode_info *inode = __bch2_create(idmap, to_bch_ei(vdir), file->f_path.dentry, mode, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); d_mark_tmpfile(file, &inode->v); d_instantiate(file->f_path.dentry, &inode->v); return finish_open_simple(file, 0); } static int bch2_fill_extent(struct bch_fs *c, struct fiemap_extent_info *info, struct bkey_s_c k, unsigned flags) { if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; int ret; if (k.k->type == KEY_TYPE_reflink_v) flags |= FIEMAP_EXTENT_SHARED; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int flags2 = 0; u64 offset = p.ptr.offset; if (p.ptr.unwritten) flags2 |= FIEMAP_EXTENT_UNWRITTEN; if (p.crc.compression_type) flags2 |= FIEMAP_EXTENT_ENCODED; else offset += p.crc.offset; if ((offset & (block_sectors(c) - 1)) || (k.k->size & (block_sectors(c) - 1))) flags2 |= 
FIEMAP_EXTENT_NOT_ALIGNED;

			ret = fiemap_fill_next_extent(info,
						bkey_start_offset(k.k) << 9,
						offset << 9,
						k.k->size << 9, flags|flags2);
			if (ret)
				return ret;
		}

		return 0;
	} else if (bkey_extent_is_inline_data(k.k)) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DATA_INLINE);
	} else if (k.k->type == KEY_TYPE_reservation) {
		return fiemap_fill_next_extent(info,
					       bkey_start_offset(k.k) << 9,
					       0, k.k->size << 9,
					       flags|
					       FIEMAP_EXTENT_DELALLOC|
					       FIEMAP_EXTENT_UNWRITTEN);
	} else {
		BUG();
	}
}

/*
 * ->fiemap: report the extents of @vinode in the byte range
 * [@start, @start + @len).
 *
 * Walks the extents btree in the inode's current snapshot, following
 * indirect extents through bch2_read_indirect_extent().  Reporting runs one
 * extent behind the walk, so the final extent can be emitted with
 * FIEMAP_EXTENT_LAST.  Btree locks are dropped before each call into
 * bch2_fill_extent().
 *
 * Returns 0 or a negative error code; a positive return from
 * bch2_fill_extent() ends the walk and is reported as success.
 */
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
		       u64 start, u64 len)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *ei = to_bch_ei(vinode);
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_buf cur, prev;
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
	/* reject ranges whose end wraps around */
	if (start + len < start)
		return -EINVAL;

	/* from here on, start is shifted right by 9 like the btree offsets */
	start >>= 9;

	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	trans = bch2_trans_get(c);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(ei->v.i_ino, start), 0);

	/*
	 * Each "continue" with ret set re-evaluates the loop condition: a
	 * transaction restart retries the iteration, any other error ends
	 * the loop.
	 */
	while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
		if (ret)
			continue;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		k = bch2_btree_iter_peek_upto(&iter, end);
		ret = bkey_err(k);
		if (ret)
			continue;

		if (!k.k)
			break;

		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
			bch2_btree_iter_advance(&iter);
			continue;
		}

		offset_into_extent	= iter.pos.offset -
			bkey_start_offset(k.k);
		sectors			= k.k->size - offset_into_extent;

		bch2_bkey_buf_reassemble(&cur, c, k);

		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &cur);
		if (ret)
			continue;

		k = bkey_i_to_s_c(cur.k);
		bch2_bkey_buf_realloc(&prev, c, k.k->u64s);

		sectors = min(sectors, k.k->size - offset_into_extent);

		/* trim cur to the part at the iterator position */
		bch2_cut_front(POS(k.k->p.inode,
				   bkey_start_offset(k.k) +
				   offset_into_extent),
			       cur.k);
		bch2_key_resize(&cur.k->k, sectors);
		cur.k->k.p = iter.pos;
		cur.k->k.p.offset += cur.k->k.size;

		/* emit the previous extent now that we know it isn't the last */
		if (have_extent) {
			bch2_trans_unlock(trans);
			ret = bch2_fill_extent(c, info,
					bkey_i_to_s_c(prev.k), 0);
			if (ret)
				break;
		}

		bkey_copy(prev.k, cur.k);
		have_extent = true;

		bch2_btree_iter_set_pos(&iter,
			POS(iter.pos.inode, iter.pos.offset + sectors));
	}
	bch2_trans_iter_exit(trans, &iter);

	if (!ret && have_extent) {
		bch2_trans_unlock(trans);
		ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
				       FIEMAP_EXTENT_LAST);
	}

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&cur, c);
	bch2_bkey_buf_exit(&prev, c);
	return ret < 0 ? ret : 0;
}

/* Page fault handlers for memory-mapped files. */
static const struct vm_operations_struct bch_vm_ops = {
	.fault		= bch2_page_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite   = bch2_page_mkwrite,
};

/* ->mmap: mark the file accessed and install bch_vm_ops on the mapping. */
static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);

	vma->vm_ops = &bch_vm_ops;
	return 0;
}

/* Directories: */

/* ->llseek for directories: S64_MAX is passed as both size arguments. */
static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
{
	return generic_file_llseek_size(file, offset, whence,
					S64_MAX, S64_MAX);
}

/*
 * ->iterate_shared: emit "." and "..", then the directory's entries via
 * bch2_readdir().  Returns 0 early if the dot entries could not be emitted.
 */
static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	if (!dir_emit_dots(file, ctx))
		return 0;

	int ret = bch2_readdir(c, inode_inum(inode), ctx);

	bch_err_fn(c, ret);
	return bch2_err_class(ret);
}

/*
 * ->open: opening for write is refused on a read-only subvolume; every file
 * is marked as supporting O_DIRECT.
 */
static int bch2_open(struct inode *vinode, struct file *file)
{
	if (file->f_flags & (O_WRONLY|O_RDWR)) {
		struct bch_inode_info *inode = to_bch_ei(vinode);
		struct bch_fs *c = inode->v.i_sb->s_fs_info;

		int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
		if (ret)
			return ret;
	}

	file->f_mode |= FMODE_CAN_ODIRECT;

	return generic_file_open(vinode, file);
}

static const struct
file_operations bch_file_operations = {
	/* file operations for regular files (installed for S_IFREG inodes) */
	.open		= bch2_open,
	.llseek		= bch2_llseek,
	.read_iter	= bch2_read_iter,
	.write_iter	= bch2_write_iter,
	.mmap		= bch2_mmap,
	.get_unmapped_area = thp_get_unmapped_area,
	.fsync		= bch2_fsync,
	.splice_read	= filemap_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= bch2_fallocate_dispatch,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
	.remap_file_range = bch2_remap_file_range,
};

/* inode operations for regular files; the only table providing ->fiemap */
static const struct inode_operations bch_file_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.fiemap		= bch2_fiemap,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};

/*
 * inode operations for directories.  Note that ->rmdir and ->unlink share
 * bch2_unlink().
 */
static const struct inode_operations bch_dir_inode_operations = {
	.lookup		= bch2_lookup,
	.create		= bch2_create,
	.link		= bch2_link,
	.unlink		= bch2_unlink,
	.symlink	= bch2_symlink,
	.mkdir		= bch2_mkdir,
	.rmdir		= bch2_unlink,
	.mknod		= bch2_mknod,
	.rename		= bch2_rename2,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.tmpfile	= bch2_tmpfile,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};

/* file operations for open directories */
static const struct file_operations bch_dir_file_operations = {
	.llseek		= bch2_dir_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= bch2_vfs_readdir,
	.fsync		= bch2_fsync,
	.unlocked_ioctl = bch2_fs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= bch2_compat_fs_ioctl,
#endif
};

/* inode operations for symlinks */
static const struct inode_operations bch_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};

/* inode operations for every other file type (the switch default) */
static const struct inode_operations bch_special_inode_operations = {
	.getattr	= bch2_getattr,
	.setattr	= bch2_setattr,
	.listxattr	= bch2_xattr_list,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
	.get_inode_acl	= bch2_get_acl,
	.set_acl	= bch2_set_acl,
#endif
};

/* address space operations installed on every inode's i_mapping */
static const struct address_space_operations bch_address_space_operations = {
	.read_folio	= bch2_read_folio,
	.writepages	= bch2_writepages,
	.readahead	= bch2_readahead,
	.dirty_folio	= filemap_dirty_folio,
	.write_begin	= bch2_write_begin,
	.write_end	= bch2_write_end,
	.invalidate_folio = bch2_invalidate_folio,
	.release_folio	= bch2_release_folio,
#ifdef CONFIG_MIGRATION
	.migrate_folio	= filemap_migrate_folio,
#endif
	.error_remove_folio = generic_error_remove_folio,
};

/*
 * File handle payload identifying one inode: inode number, subvolume and
 * inode generation.  Packed, so its size is a whole number of u32s.
 */
struct bcachefs_fid {
	u64		inum;
	u32		subvol;
	u32		gen;
} __packed;

/* File handle payload carrying both the inode and its parent directory */
struct bcachefs_fid_with_parent {
	struct bcachefs_fid	fid;
	struct bcachefs_fid	dir;
} __packed;

/*
 * Check that a handle of type @fh_type has exactly the expected length.
 * @fh_len is compared in units of u32; unknown types are invalid.
 */
static int bcachefs_fid_valid(int fh_len, int fh_type)
{
	switch (fh_type) {
	case FILEID_BCACHEFS_WITHOUT_PARENT:
		return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
	case FILEID_BCACHEFS_WITH_PARENT:
		return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
	default:
		return false;
	}
}

/* Build the file handle payload for @inode from its cached fields */
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
	return (struct bcachefs_fid) {
		.inum	= inode->ei_inum.inum,
		.subvol	= inode->ei_inum.subvol,
		.gen	= inode->ei_inode.bi_generation,
	};
}

/*
 * ->encode_fh export operation.
 *
 * A non-directory with a parent directory supplied is encoded together with
 * that parent (FILEID_BCACHEFS_WITH_PARENT); everything else is encoded on
 * its own (FILEID_BCACHEFS_WITHOUT_PARENT).  *@len is in units of u32: when
 * the caller's buffer is too small it is set to the required length and
 * FILEID_INVALID is returned, otherwise it is set to the length written.
 */
static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
			  struct inode *vdir)
{
	struct bch_inode_info *inode	= to_bch_ei(vinode);
	struct bch_inode_info *dir	= to_bch_ei(vdir);
	int min_len;

	if (!S_ISDIR(inode->v.i_mode) && dir) {
		struct bcachefs_fid_with_parent *fid = (void *) fh;

		min_len = sizeof(*fid) / sizeof(u32);
		if (*len < min_len) {
			*len = min_len;
			return FILEID_INVALID;
		}

		fid->fid = bch2_inode_to_fid(inode);
		fid->dir = bch2_inode_to_fid(dir);

		*len = min_len;
		return FILEID_BCACHEFS_WITH_PARENT;
	} else {
		struct bcachefs_fid *fid = (void *) fh;

		min_len = sizeof(*fid) / sizeof(u32);
		if (*len < min_len) {
			*len = min_len;
			return FILEID_INVALID;
		}
		*fid = bch2_inode_to_fid(inode);

		*len = min_len;
		return FILEID_BCACHEFS_WITHOUT_PARENT;
	}
}

static struct inode
*bch2_nfs_get_inode(struct super_block *sb, struct bcachefs_fid fid) { struct bch_fs *c = sb->s_fs_info; struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { .subvol = fid.subvol, .inum = fid.inum, }); if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { iput(vinode); vinode = ERR_PTR(-ESTALE); } return vinode; } static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { struct bcachefs_fid *fid = (void *) _fid; if (!bcachefs_fid_valid(fh_len, fh_type)) return NULL; return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); } static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { struct bcachefs_fid_with_parent *fid = (void *) _fid; if (!bcachefs_fid_valid(fh_len, fh_type) || fh_type != FILEID_BCACHEFS_WITH_PARENT) return NULL; return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); } static struct dentry *bch2_get_parent(struct dentry *child) { struct bch_inode_info *inode = to_bch_ei(child->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum parent_inum = { .subvol = inode->ei_inode.bi_parent_subvol ?: inode->ei_inum.subvol, .inum = inode->ei_inode.bi_dir, }; return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); } static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) { struct bch_inode_info *inode = to_bch_ei(child->d_inode); struct bch_inode_info *dir = to_bch_ei(parent->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans *trans; struct btree_iter iter1; struct btree_iter iter2; struct bkey_s_c k; struct bkey_s_c_dirent d; struct bch_inode_unpacked inode_u; subvol_inum target; u32 snapshot; struct qstr dirent_name; unsigned name_len = 0; int ret; if (!S_ISDIR(dir->v.i_mode)) return -EINVAL; trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 0), 0); bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, POS(dir->ei_inode.bi_inum, 
0), 0); retry: bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); if (ret) goto err; bch2_btree_iter_set_snapshot(&iter1, snapshot); bch2_btree_iter_set_snapshot(&iter2, snapshot); ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); if (ret) goto err; if (inode_u.bi_dir == dir->ei_inode.bi_inum) { bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); k = bch2_btree_iter_peek_slot(&iter1); ret = bkey_err(k); if (ret) goto err; if (k.k->type != KEY_TYPE_dirent) { ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; goto err; } d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret > 0) ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; if (ret) goto err; if (subvol_inum_eq(target, inode->ei_inum)) goto found; } else { /* * File with multiple hardlinks and our backref is to the wrong * directory - linear search: */ for_each_btree_key_continue_norestart(iter2, 0, k, ret) { if (k.k->p.inode > dir->ei_inode.bi_inum) break; if (k.k->type != KEY_TYPE_dirent) continue; d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret < 0) break; if (ret) continue; if (subvol_inum_eq(target, inode->ei_inum)) goto found; } } ret = -ENOENT; goto err; found: dirent_name = bch2_dirent_get_name(d); name_len = min_t(unsigned, dirent_name.len, NAME_MAX); memcpy(name, dirent_name.name, name_len); name[name_len] = '\0'; err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_iter_exit(trans, &iter1); bch2_trans_iter_exit(trans, &iter2); bch2_trans_put(trans); return ret; } static const struct export_operations bch_export_ops = { .encode_fh = bch2_encode_fh, .fh_to_dentry = bch2_fh_to_dentry, .fh_to_parent = bch2_fh_to_parent, .get_parent = bch2_get_parent, .get_name = bch2_get_name, }; static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, 
struct bch_inode_unpacked *bi, struct bch_subvolume *subvol)
{
	/*
	 * Populate a VFS inode from the unpacked bcachefs inode @bi: identity,
	 * size and block counts, quota ids, the snapshot flag and the
	 * operation tables matching the file type.
	 *
	 * i_ino is set before bch2_inode_update_after_write() because that
	 * function BUG_ON()s when bi->bi_inum differs from inode->v.i_ino.
	 */
	inode->v.i_ino		= inum.inum;
	inode->ei_inum		= inum;
	inode->ei_inode.bi_inum	= inum.inum;
	bch2_inode_update_after_write(trans, inode, bi, ~0);

	inode->v.i_blocks	= bi->bi_sectors;
	inode->v.i_ino		= bi->bi_inum;
	inode->v.i_rdev		= bi->bi_dev;
	inode->v.i_generation	= bi->bi_generation;
	inode->v.i_size		= bi->bi_size;

	inode->ei_flags		= 0;
	inode->ei_quota_reserved = 0;
	inode->ei_qid		= bch_qid(bi);

	if (BCH_SUBVOLUME_SNAP(subvol))
		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);

	inode->v.i_mapping->a_ops = &bch_address_space_operations;

	/* pick inode/file operations by file type */
	switch (inode->v.i_mode & S_IFMT) {
	case S_IFREG:
		inode->v.i_op	= &bch_file_inode_operations;
		inode->v.i_fop	= &bch_file_operations;
		break;
	case S_IFDIR:
		inode->v.i_op	= &bch_dir_inode_operations;
		inode->v.i_fop	= &bch_dir_file_operations;
		break;
	case S_IFLNK:
		inode_nohighmem(&inode->v);
		inode->v.i_op	= &bch_symlink_inode_operations;
		break;
	default:
		init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
		inode->v.i_op	= &bch_special_inode_operations;
		break;
	}

	mapping_set_large_folios(inode->v.i_mapping);
}

/* ->free_inode: return the containing bch_inode_info to bch2_inode_cache */
static void bch2_free_inode(struct inode *vinode)
{
	kmem_cache_free(bch2_inode_cache, to_bch_ei(vinode));
}

/*
 * inode_set_fn callback for bch2_write_inode(): copy the VFS inode's atime,
 * mtime and ctime into the unpacked inode @bi.  @trans and @p are unused.
 */
static int inode_update_times_fn(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 struct bch_inode_unpacked *bi,
				 void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;

	bi->bi_atime	= timespec_to_bch2_time(c, inode_get_atime(&inode->v));
	bi->bi_mtime	= timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
	bi->bi_ctime	= timespec_to_bch2_time(c, inode_get_ctime(&inode->v));

	return 0;
}

/*
 * ->write_inode: write the VFS timestamps back through bch2_write_inode()
 * while holding ei_update_lock.  @wbc is unused.
 */
static int bch2_vfs_write_inode(struct inode *vinode,
				struct writeback_control *wbc)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	int ret;

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
			       ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
	mutex_unlock(&inode->ei_update_lock);

	return bch2_err_class(ret);
}

/*
 * ->evict_inode.  An inode with no links left (and not marked bad) is
 * deleted: its quota usage is released and bch2_inode_rm() is called.
 * In every case the inode is removed from the VFS inode hash table and from
 * the filesystem's vfs_inodes_list.
 */
static void bch2_evict_inode(struct inode *vinode)
{
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);
	bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v);

	/*
	 * evict() has waited for outstanding writeback, we'll do no more IO
	 * through this inode: it's safe to remove from VFS inode hashtable here
	 *
	 * Do that now so that other threads aren't blocked from pulling it back
	 * in, there's no reason for them to be:
	 */
	if (!delete)
		bch2_inode_hash_remove(c, inode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);

	BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);

	if (delete) {
		bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
				KEY_TYPE_QUOTA_WARN);
		bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
				KEY_TYPE_QUOTA_WARN);
		bch2_inode_rm(c, inode_inum(inode));

		/*
		 * If we are deleting, we need it present in the vfs hash table
		 * so that fsck can check if unlinked inodes are still open:
		 */
		bch2_inode_hash_remove(c, inode);
	}

	mutex_lock(&c->vfs_inodes_lock);
	list_del_init(&inode->ei_vfs_inode_list);
	mutex_unlock(&c->vfs_inodes_lock);
}

/*
 * Push every cached VFS inode whose subvolume id is listed in @s out of the
 * inode cache.  Loops (via the "again" label) until a pass finds nothing
 * left to mark.
 */
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
	struct bch_inode_info *inode;
	DARRAY(struct bch_inode_info *) grabbed;
	bool clean_pass = false, this_pass_clean;

	/*
	 * Initially, we scan for inodes without I_DONTCACHE, then mark them to
	 * be pruned with d_mark_dontcache().
	 *
	 * Once we've had a clean pass where we didn't find any inodes without
	 * I_DONTCACHE, we wait for them to be freed:
	 */

	darray_init(&grabbed);
	darray_make_room(&grabbed, 1024);
again:
	cond_resched();
	this_pass_clean = true;

	mutex_lock(&c->vfs_inodes_lock);
	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
		if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
			continue;

		if (!(inode->v.i_state & I_DONTCACHE) &&
		    !(inode->v.i_state & I_FREEING) &&
		    igrab(&inode->v)) {
			this_pass_clean = false;

			/*
			 * No room to remember this inode: drop the reference
			 * just taken and process what was collected so far;
			 * this_pass_clean is already false, so another pass
			 * follows.
			 */
			if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) {
				iput(&inode->v);
				break;
			}
		} else if (clean_pass && this_pass_clean) {
			struct wait_bit_queue_entry wqe;
			struct wait_queue_head *wq_head;

			/*
			 * Wait phase: sleep on this inode's __I_NEW bit
			 * waitqueue with the list lock dropped, then rescan
			 * from the start.
			 */
			wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
			prepare_to_wait_event(wq_head, &wqe.wq_entry,
					      TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->vfs_inodes_lock);

			schedule();
			finish_wait(wq_head, &wqe.wq_entry);
			goto again;
		}
	}
	mutex_unlock(&c->vfs_inodes_lock);

	/* outside the list lock: mark, prune and release what was grabbed */
	darray_for_each(grabbed, i) {
		inode = *i;
		d_mark_dontcache(&inode->v);
		d_prune_aliases(&inode->v);
		iput(&inode->v);
	}
	grabbed.nr = 0;

	if (!clean_pass || !this_pass_clean) {
		clean_pass = this_pass_clean;
		goto again;
	}

	darray_exit(&grabbed);
}

/*
 * ->statfs: fill @buf from bch2_fs_usage_read_short().  Usage counters are
 * shifted right by (s_blocksize_bits - 9) to express them in units of the
 * superblock block size.
 */
static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct bch_fs *c = sb->s_fs_info;
	struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
	unsigned shift = sb->s_blocksize_bits - 9;
	/*
	 * this assumes inodes take up 64 bytes, which is a decent average
	 * number:
	 */
	u64 avail_inodes = ((usage.capacity - usage.used) << 3);

	buf->f_type	= BCACHEFS_STATFS_MAGIC;
	buf->f_bsize	= sb->s_blocksize;
	buf->f_blocks	= usage.capacity >> shift;
	buf->f_bfree	= usage.free >> shift;
	buf->f_bavail	= avail_factor(usage.free) >> shift;

	buf->f_files	= usage.nr_inodes + avail_inodes;
	buf->f_ffree	= avail_inodes;

	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
	buf->f_namelen	= BCH_NAME_MAX;

	return 0;
}

static int bch2_sync_fs(struct
super_block *sb, int wait)
{
	/*
	 * ->sync_fs: flush the journal.  Does nothing when the
	 * journal_flush_disabled option is set; with !@wait an asynchronous
	 * flush is started and 0 is returned without waiting for it.
	 */
	struct bch_fs *c = sb->s_fs_info;
	int ret;

	trace_bch2_sync_fs(sb, wait);

	if (c->opts.journal_flush_disabled)
		return 0;

	if (!wait) {
		bch2_journal_flush_async(&c->journal, NULL);
		return 0;
	}

	ret = bch2_journal_flush(&c->journal);
	return bch2_err_class(ret);
}

/*
 * Map a block device path to the bch_fs found by bch2_dev_to_fs().
 *
 * Returns the filesystem, ERR_PTR() of the lookup_bdev() error, or
 * ERR_PTR(-ENOENT) when bch2_dev_to_fs() finds nothing.
 *
 * NOTE(review): closure_put() is called on c->cl before returning,
 * presumably dropping a reference taken by bch2_dev_to_fs(); the returned
 * pointer is then not backed by that reference - confirm callers only
 * compare it.
 */
static struct bch_fs *bch2_path_to_fs(const char *path)
{
	struct bch_fs *c;
	dev_t dev;
	int ret;

	ret = lookup_bdev(path, &dev);
	if (ret)
		return ERR_PTR(ret);

	c = bch2_dev_to_fs(dev);
	if (c)
		closure_put(&c->cl);
	return c ?: ERR_PTR(-ENOENT);
}

/*
 * Apply remount options.
 *
 * A change of the read-only state is performed under c->state_lock and
 * mirrored into sb->s_flags; failure to go read-write is logged and reported
 * as -EINVAL.  The "errors" option is updated when it was given in @opts.
 */
static int bch2_remount(struct super_block *sb, int *flags,
			struct bch_opts opts)
{
	struct bch_fs *c = sb->s_fs_info;
	int ret = 0;

	opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);

	if (opts.read_only != c->opts.read_only) {
		down_write(&c->state_lock);

		if (opts.read_only) {
			bch2_fs_read_only(c);

			sb->s_flags |= SB_RDONLY;
		} else {
			ret = bch2_fs_read_write(c);
			if (ret) {
				bch_err(c, "error going rw: %i", ret);
				up_write(&c->state_lock);
				ret = -EINVAL;
				goto err;
			}

			sb->s_flags &= ~SB_RDONLY;
		}

		c->opts.read_only = opts.read_only;

		up_write(&c->state_lock);
	}

	if (opt_defined(opts, errors))
		c->opts.errors = opts.errors;
err:
	return bch2_err_class(ret);
}

/*
 * ->show_devname: print the sb_name of every online member device,
 * separated by ':'.
 */
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	bool first = true;

	for_each_online_member(c, ca) {
		if (!first)
			seq_putc(seq, ':');
		first = false;
		seq_puts(seq, ca->disk_sb.sb_name);
	}

	return 0;
}

/*
 * ->show_options: format the mount options with bch2_opts_to_text() and
 * append them, preceded by a comma, to @seq.  Returns -ENOMEM if the
 * printbuf recorded an allocation failure.
 */
static int bch2_show_options(struct seq_file *seq, struct dentry *root)
{
	struct bch_fs *c = root->d_sb->s_fs_info;
	struct printbuf buf = PRINTBUF;

	bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
			  OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
	printbuf_nul_terminate(&buf);
	seq_printf(seq, ",%s", buf.buf);

	int ret = buf.allocation_failure ?
-ENOMEM : 0; printbuf_exit(&buf); return ret; } static void bch2_put_super(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; __bch2_fs_stop(c); } /* * bcachefs doesn't currently integrate intwrite freeze protection but the * internal write references serve the same purpose. Therefore reuse the * read-only transition code to perform the quiesce. The caveat is that we don't * currently have the ability to block tasks that want a write reference while * the superblock is frozen. This is fine for now, but we should either add * blocking support or find a way to integrate sb_start_intwrite() and friends. */ static int bch2_freeze(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); return 0; } static int bch2_unfreeze(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; int ret; if (test_bit(BCH_FS_emergency_ro, &c->flags)) return 0; down_write(&c->state_lock); ret = bch2_fs_read_write(c); up_write(&c->state_lock); return ret; } static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .free_inode = bch2_free_inode, .write_inode = bch2_vfs_write_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, .statfs = bch2_statfs, .show_devname = bch2_show_devname, .show_options = bch2_show_options, .put_super = bch2_put_super, .freeze_fs = bch2_freeze, .unfreeze_fs = bch2_unfreeze, }; static int bch2_set_super(struct super_block *s, void *data) { s->s_fs_info = data; return 0; } static int bch2_noset_super(struct super_block *s, void *data) { return -EBUSY; } typedef DARRAY(struct bch_fs *) darray_fs; static int bch2_test_super(struct super_block *s, void *data) { struct bch_fs *c = s->s_fs_info; darray_fs *d = data; if (!c) return false; darray_for_each(*d, i) if (c != *i) return false; return true; } static int bch2_fs_get_tree(struct fs_context *fc) { struct bch_fs *c; struct super_block *sb; struct inode *vinode; struct 
bch2_opts_parse *opts_parse = fc->fs_private; struct bch_opts opts = opts_parse->opts; darray_str devs; darray_fs devs_to_fs = {}; int ret; opt_set(opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); opt_set(opts, nostart, true); if (!fc->source || strlen(fc->source) == 0) return -EINVAL; ret = bch2_split_devs(fc->source, &devs); if (ret) return ret; darray_for_each(devs, i) { ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i)); if (ret) goto err; } sb = sget(fc->fs_type, bch2_test_super, bch2_noset_super, fc->sb_flags|SB_NOSEC, &devs_to_fs); if (!IS_ERR(sb)) goto got_sb; c = bch2_fs_open(devs.data, devs.nr, opts); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; /* Some options can't be parsed until after the fs is started: */ opts = bch2_opts_empty(); ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); if (ret) goto err_stop_fs; bch2_opts_apply(&c->opts, opts); ret = bch2_fs_start(c); if (ret) goto err_stop_fs; sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); ret = PTR_ERR_OR_ZERO(sb); if (ret) goto err_stop_fs; got_sb: c = sb->s_fs_info; if (sb->s_root) { if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) { ret = -EBUSY; goto err_put_super; } goto out; } sb->s_blocksize = block_bytes(c); sb->s_blocksize_bits = ilog2(block_bytes(c)); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &bch_super_operations; sb->s_export_op = &bch_export_ops; #ifdef CONFIG_BCACHEFS_QUOTA sb->s_qcop = &bch2_quotactl_operations; sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; #endif sb->s_xattr = bch2_xattr_handlers; sb->s_magic = BCACHEFS_STATFS_MAGIC; sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); sb->s_uuid = c->sb.user_uuid; sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); ret = super_setup_bdi(sb); if (ret) goto err_put_super; sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; 
for_each_online_member(c, ca) { struct block_device *bdev = ca->disk_sb.bdev; /* XXX: create an anonymous device for multi device filesystems */ sb->s_bdev = bdev; sb->s_dev = bdev->bd_dev; percpu_ref_put(&ca->io_ref); break; } c->dev = sb->s_dev; #ifdef CONFIG_BCACHEFS_POSIX_ACL if (c->opts.acl) sb->s_flags |= SB_POSIXACL; #endif sb->s_shrink->seeks = 0; vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); bch_err_msg(c, ret, "mounting: error getting root inode"); if (ret) goto err_put_super; sb->s_root = d_make_root(vinode); if (!sb->s_root) { bch_err(c, "error mounting: error allocating root dentry"); ret = -ENOMEM; goto err_put_super; } sb->s_flags |= SB_ACTIVE; out: fc->root = dget(sb->s_root); err: darray_exit(&devs_to_fs); bch2_darray_str_exit(&devs); if (ret) pr_err("error: %s", bch2_err_str(ret)); /* * On an inconsistency error in recovery we might see an -EROFS derived * errorcode (from the journal), but we don't want to return that to * userspace as that causes util-linux to retry the mount RO - which is * confusing: */ if (bch2_err_matches(ret, EROFS) && ret != -EROFS) ret = -EIO; return bch2_err_class(ret); err_stop_fs: bch2_fs_stop(c); goto err; err_put_super: __bch2_fs_stop(c); deactivate_locked_super(sb); goto err; } static void bch2_kill_sb(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; generic_shutdown_super(sb); bch2_fs_free(c); } static void bch2_fs_context_free(struct fs_context *fc) { struct bch2_opts_parse *opts = fc->fs_private; if (opts) { printbuf_exit(&opts->parse_later); kfree(opts); } } static int bch2_fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { /* * the "source" param, i.e., the name of the device(s) to mount, * is handled by the VFS layer. 
*/ if (!strcmp(param->key, "source")) return -ENOPARAM; struct bch2_opts_parse *opts = fc->fs_private; struct bch_fs *c = NULL; /* for reconfigure, we already have a struct bch_fs */ if (fc->root) c = fc->root->d_sb->s_fs_info; int ret = bch2_parse_one_mount_opt(c, &opts->opts, &opts->parse_later, param->key, param->string); return bch2_err_class(ret); } static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct bch2_opts_parse *opts = fc->fs_private; return bch2_remount(sb, &fc->sb_flags, opts->opts); } static const struct fs_context_operations bch2_context_ops = { .free = bch2_fs_context_free, .parse_param = bch2_fs_parse_param, .get_tree = bch2_fs_get_tree, .reconfigure = bch2_fs_reconfigure, }; static int bch2_init_fs_context(struct fs_context *fc) { struct bch2_opts_parse *opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; opts->parse_later = PRINTBUF; fc->ops = &bch2_context_ops; fc->fs_private = opts; return 0; } void bch2_fs_vfs_exit(struct bch_fs *c) { if (c->vfs_inodes_table.tbl) rhashtable_destroy(&c->vfs_inodes_table); } int bch2_fs_vfs_init(struct bch_fs *c) { return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); } static struct file_system_type bcache_fs_type = { .owner = THIS_MODULE, .name = "bcachefs", .init_fs_context = bch2_init_fs_context, .kill_sb = bch2_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("bcachefs"); void bch2_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); kmem_cache_destroy(bch2_inode_cache); } int __init bch2_vfs_init(void) { int ret = -ENOMEM; bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT); if (!bch2_inode_cache) goto err; ret = register_filesystem(&bcache_fs_type); if (ret) goto err; return 0; err: bch2_vfs_exit(); return ret; } #endif /* NO_BCACHEFS_FS */