From d92d88f0568e97c437eeb79d9c9609bd8277406f Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Jun 2021 18:11:50 +0200 Subject: [PATCH 01/15] fuse: Fix crash in fuse_dentry_automount() error path If fuse_fill_super_submount() returns an error, the error path triggers a crash: [ 26.206673] BUG: kernel NULL pointer dereference, address: 0000000000000000 [...] [ 26.226362] RIP: 0010:__list_del_entry_valid+0x25/0x90 [...] [ 26.247938] Call Trace: [ 26.248300] fuse_mount_remove+0x2c/0x70 [fuse] [ 26.248892] virtio_kill_sb+0x22/0x160 [virtiofs] [ 26.249487] deactivate_locked_super+0x36/0xa0 [ 26.250077] fuse_dentry_automount+0x178/0x1a0 [fuse] The crash happens because fuse_mount_remove() assumes that the FUSE mount was already added to list under the FUSE connection, but this only done after fuse_fill_super_submount() has returned success. This means that until fuse_fill_super_submount() has returned success, the FUSE mount isn't actually owned by the superblock. We should thus reclaim ownership by clearing sb->s_fs_info, which will skip the call to fuse_mount_remove(), and perform rollback, like virtio_fs_get_tree() already does for the root sb. Fixes: bf109c64040f ("fuse: implement crossmounts") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Greg Kurz Reviewed-by: Max Reitz Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1b6c001a7dd1..01559061cbfb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -339,8 +339,12 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) /* Initialize superblock, making @mp_fi its root */ err = fuse_fill_super_submount(sb, mp_fi); - if (err) + if (err) { + fuse_conn_put(fc); + kfree(fm); + sb->s_fs_info = NULL; goto out_put_sb; + } sb->s_flags |= SB_ACTIVE; fsc->root = dget(sb->s_root); From e3a43f2a95393000778f8f302d48795add2fc4a8 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Jun 2021 18:11:51 +0200 Subject: [PATCH 02/15] fuse: Fix crash if superblock of submount gets killed early As soon as fuse_dentry_automount() does up_write(&sb->s_umount), the superblock can theoretically be killed. If this happens before the submount was added to the &fc->mounts list, fuse_mount_remove() later crashes in list_del_init() because it assumes the submount to be already there. Add the submount before dropping sb->s_umount to fix the inconsistency. It is okay to nest fc->killsb under sb->s_umount, we already do this on the ->kill_sb() path. Signed-off-by: Greg Kurz Fixes: bf109c64040f ("fuse: implement crossmounts") Cc: stable@vger.kernel.org # v5.10+ Reviewed-by: Max Reitz Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 01559061cbfb..3fd1b71e546b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -346,15 +346,15 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) goto out_put_sb; } + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + sb->s_flags |= SB_ACTIVE; fsc->root = dget(sb->s_root); /* We are done configuring the superblock, so unlock it */ up_write(&sb->s_umount); - down_write(&fc->killsb); - list_add_tail(&fm->fc_entry, &fc->mounts); - up_write(&fc->killsb); - /* Create the submount */ mnt = vfs_create_mount(fsc); if (IS_ERR(mnt)) { From e4a9ccdd1c03b3dc58214874399d24331ea0a3ab Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Jun 2021 18:11:52 +0200 Subject: [PATCH 03/15] fuse: Fix infinite loop in sget_fc() We don't set the SB_BORN flag on submounts. This is wrong as these superblocks are then considered as partially constructed or dying in the rest of the code and can break some assumptions. One such case is when you have a virtiofs filesystem with submounts and you try to mount it again : virtio_fs_get_tree() tries to obtain a superblock with sget_fc(). The logic in sget_fc() is to loop until it has either found an existing matching superblock with SB_BORN set or to create a brand new one. It is assumed that a superblock without SB_BORN is transient and the loop is restarted. Forgetting to set SB_BORN on submounts hence causes sget_fc() to retry forever. Setting SB_BORN requires special care, i.e. a write barrier for super_cache_count() which can check SB_BORN without taking any lock. We should call vfs_get_tree() to deal with that but this requires to have a proper ->get_tree() implementation for submounts, which is a bigger piece of work. Go for a simple bug fix in the meatime. Fixes: bf109c64040f ("fuse: implement crossmounts") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Greg Kurz Reviewed-by: Max Reitz Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3fd1b71e546b..3fa8604c21d5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -352,6 +352,17 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) sb->s_flags |= SB_ACTIVE; fsc->root = dget(sb->s_root); + + /* + * FIXME: setting SB_BORN requires a write barrier for + * super_cache_count(). We should actually come + * up with a proper ->get_tree() implementation + * for submounts and call vfs_get_tree() to take + * care of the write barrier. + */ + smp_wmb(); + sb->s_flags |= SB_BORN; + /* We are done configuring the superblock, so unlock it */ up_write(&sb->s_umount); From b89ecd60d38ec042d63bdb376c722a16f92bcb88 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 18 Jun 2021 21:16:42 +0200 Subject: [PATCH 04/15] fuse: ignore PG_workingset after stealing Fix the "fuse: trying to steal weird page" warning. Description from Johannes Weiner: "Think of it as similar to PG_active. It's just another usage/heat indicator of file and anon pages on the reclaim LRU that, unlike PG_active, persists across deactivation and even reclaim (we store it in the page cache / swapper cache tree until the page refaults). So if fuse accepts pages that can legally have PG_active set, PG_workingset is fine too." Reported-by: Thomas Lindroth Fixes: 1899ad18c607 ("mm: workingset: tell cache transitions from workingset thrashing") Cc: # v4.20 Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a5ceccc5ef00..817a0b1c5c25 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -783,6 +783,7 @@ static int fuse_check_page(struct page *page) 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | + 1 << PG_workingset | 1 << PG_reclaim | 1 << PG_waiters))) { dump_page(page, "fuse: trying to steal weird page"); From 80ef08670d4c28a06a3de954bd350368780bcfef Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 22 Jun 2021 09:15:35 +0200 Subject: [PATCH 05/15] fuse: check connected before queueing on fpq->io A request could end up on the fpq->io list after fuse_abort_conn() has reset fpq->connected and aborted requests on that list: Thread-1 Thread-2 ======== ======== ->fuse_simple_request() ->shutdown ->__fuse_request_send() ->queue_request() ->fuse_abort_conn() ->fuse_dev_do_read() ->acquire(fpq->lock) ->wait_for(fpq->lock) ->set err to all req's in fpq->io ->release(fpq->lock) ->acquire(fpq->lock) ->add req to fpq->io After the userspace copy is done the request will be ended, but req->out.h.error will remain uninitialized. Also the copy might block despite being already aborted. Fix both issues by not allowing the request to be queued on the fpq->io list after fuse_abort_conn() has processed this list. Reported-by: Pradeep P V K Fixes: fd22d62ed0c3 ("fuse: no fc->lock for iqueue parts") Cc: # v4.2 Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 817a0b1c5c25..6e63bcba2a40 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1272,6 +1272,15 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto restart; } spin_lock(&fpq->lock); + /* + * Must not put request on fpq->io queue after having been shut down by + * fuse_abort_conn() + */ + if (!fpq->connected) { + req->out.h.error = err = -ECONNABORTED; + goto out_end; + + } list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; From 49221cf86d18bb66fe95d3338cb33bd4b9880ca5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 22 Jun 2021 09:15:35 +0200 Subject: [PATCH 06/15] fuse: reject internal errno Don't allow userspace to report errors that could be kernel-internal. Reported-by: Anatoly Trosinenko Fixes: 334f485df85a ("[PATCH] FUSE - device functions") Cc: # v2.6.14 Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6e63bcba2a40..b8d58aa08206 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1867,7 +1867,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, } err = -EINVAL; - if (oh.error <= -1000 || oh.error > 0) + if (oh.error <= -512 || oh.error > 0) goto copy_finish; spin_lock(&fpq->lock); From 2d82ab251ef0f6e7716279b04e9b5a01a86ca530 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Thu, 20 May 2021 17:46:54 +0200 Subject: [PATCH 07/15] virtiofs: propagate sync() to file server Even if POSIX doesn't mandate it, linux users legitimately expect sync() to flush all data and metadata to physical storage when it is located on the same system. This isn't happening with virtiofs though: sync() inside the guest returns right away even though data still needs to be flushed from the host page cache. This is easily demonstrated by doing the following in the guest: $ dd if=/dev/zero of=/mnt/foo bs=1M count=5K ; strace -T -e sync sync 5120+0 records in 5120+0 records out 5368709120 bytes (5.4 GB, 5.0 GiB) copied, 5.22224 s, 1.0 GB/s sync() = 0 <0.024068> and start the following in the host when the 'dd' command completes in the guest: $ strace -T -e fsync /usr/bin/sync virtiofs/foo fsync(3) = 0 <10.371640> There are no good reasons not to honor the expected behavior of sync() actually: it gives an unrealistic impression that virtiofs is super fast and that data has safely landed on HW, which isn't the case obviously. Implement a ->sync_fs() superblock operation that sends a new FUSE_SYNCFS request type for this purpose. Provision a 64-bit placeholder for possible future extensions. Since the file server cannot handle the wait == 0 case, we skip it to avoid a gratuitous roundtrip. Note that this is per-superblock: a FUSE_SYNCFS is send for the root mount and for each submount. Like with FUSE_FSYNC and FUSE_FSYNCDIR, lack of support for FUSE_SYNCFS in the file server is treated as permanent success. This ensures compatibility with older file servers: the client will get the current behavior of sync() not being propagated to the file server. Note that such an operation allows the file server to DoS sync(). Since a typical FUSE file server is an untrusted piece of software running in userspace, this is disabled by default. Only enable it with virtiofs for now since virtiofsd is supposedly trusted by the guest kernel. Reported-by: Robert Krawitz Signed-off-by: Greg Kurz Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 40 +++++++++++++++++++++++++++++++++++++++ fs/fuse/virtio_fs.c | 1 + include/uapi/linux/fuse.h | 10 +++++++++- 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7e463e220053..f48dd7ff32af 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -761,6 +761,9 @@ struct fuse_conn { /* Auto-mount submounts announced by the server */ unsigned int auto_submounts:1; + /* Propagate syncfs() to server */ + unsigned int sync_fs:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 393e36b74dc4..93d28dc1f572 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -506,6 +506,45 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +static int fuse_sync_fs(struct super_block *sb, int wait) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct fuse_syncfs_in inarg; + FUSE_ARGS(args); + int err; + + /* + * Userspace cannot handle the wait == 0 case. Avoid a + * gratuitous roundtrip. + */ + if (!wait) + return 0; + + /* The filesystem is being unmounted. Nothing to do. */ + if (!sb->s_root) + return 0; + + if (!fc->sync_fs) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.opcode = FUSE_SYNCFS; + args.nodeid = get_node_id(sb->s_root->d_inode); + args.out_numargs = 0; + + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->sync_fs = 0; + err = 0; + } + + return err; +} + enum { OPT_SOURCE, OPT_SUBTYPE, @@ -909,6 +948,7 @@ static const struct super_operations fuse_super_operations = { .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, + .sync_fs = fuse_sync_fs, .show_options = fuse_show_options, }; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index bcb8a02e2d8b..f9809b1b82f0 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1447,6 +1447,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->release = fuse_free_conn; fc->delete_stale = true; fc->auto_submounts = true; + fc->sync_fs = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 271ae90a9bb7..36ed092227fa 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -181,6 +181,9 @@ * - add FUSE_OPEN_KILL_SUIDGID * - extend fuse_setxattr_in, add FUSE_SETXATTR_EXT * - add FUSE_SETXATTR_ACL_KILL_SGID + * + * 7.34 + * - add FUSE_SYNCFS */ #ifndef _LINUX_FUSE_H @@ -216,7 +219,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 33 +#define FUSE_KERNEL_MINOR_VERSION 34 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -509,6 +512,7 @@ enum fuse_opcode { FUSE_COPY_FILE_RANGE = 47, FUSE_SETUPMAPPING = 48, FUSE_REMOVEMAPPING = 49, + FUSE_SYNCFS = 50, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -971,4 +975,8 @@ struct fuse_removemapping_one { #define FUSE_REMOVEMAPPING_MAX_ENTRY \ (PAGE_SIZE / sizeof(struct fuse_removemapping_one)) +struct fuse_syncfs_in { + uint64_t padding; +}; + #endif /* _LINUX_FUSE_H */ From fe0a7bd81bfefe5eb73bce55682586c6c266e21e Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Jun 2021 18:11:53 +0200 Subject: [PATCH 08/15] fuse: add dedicated filesystem context ops for submounts The creation of a submount is open-coded in fuse_dentry_automount(). This brings a lot of complexity and we recently had to fix bugs because we weren't setting SB_BORN or because we were unlocking sb->s_umount before sb was fully configured. Most of these could have been avoided by using the mount API instead of open-coding. Basically, this means coming up with a proper ->get_tree() implementation for submounts and call vfs_get_tree(), or better fc_mount(). The creation of the superblock for submounts is quite different from the root mount. Especially, it doesn't require to allocate a FUSE filesystem context, nor to parse parameters. Introduce a dedicated context ops for submounts to make this clear. This is just a placeholder for now, fuse_get_tree_submount() will be populated in a subsequent patch. Only visible change is that we stop allocating/freeing a useless FUSE filesystem context with submounts. Signed-off-by: Greg Kurz Reviewed-by: Max Reitz Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 5 +++++ fs/fuse/inode.c | 16 ++++++++++++++++ fs/fuse/virtio_fs.c | 3 +++ 3 files changed, 24 insertions(+) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f48dd7ff32af..b5957c8274dd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1100,6 +1100,11 @@ int fuse_fill_super_submount(struct super_block *sb, */ bool fuse_mount_remove(struct fuse_mount *fm); +/* + * Setup context ops for submounts + */ +int fuse_init_fs_context_submount(struct fs_context *fsc); + /* * Shut down the connection (possibly sending DESTROY request). */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 93d28dc1f572..a0a8228979a2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1353,6 +1353,22 @@ int fuse_fill_super_submount(struct super_block *sb, return 0; } +static int fuse_get_tree_submount(struct fs_context *fsc) +{ + return 0; +} + +static const struct fs_context_operations fuse_context_submount_ops = { + .get_tree = fuse_get_tree_submount, +}; + +int fuse_init_fs_context_submount(struct fs_context *fsc) +{ + fsc->ops = &fuse_context_submount_ops; + return 0; +} +EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount); + int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud = NULL; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index f9809b1b82f0..8f52cdaa8445 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1497,6 +1497,9 @@ static int virtio_fs_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; + if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT) + return fuse_init_fs_context_submount(fsc); + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; From 266eb3f2fae488fd19ee5acfc01ba9d483715699 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Jun 2021 18:11:54 +0200 Subject: [PATCH 09/15] fuse: Call vfs_get_tree() for submounts We recently fixed an infinite loop by setting the SB_BORN flag on submounts along with the write barrier needed by super_cache_count(). This is the job of vfs_get_tree() and FUSE shouldn't have to care about the barrier at all. Split out some code from fuse_dentry_automount() to the dedicated fuse_get_tree_submount() handler for submounts and call vfs_get_tree(). Signed-off-by: Greg Kurz Reviewed-by: Max Reitz Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 55 ++++++------------------------------------------- fs/fuse/inode.c | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 49 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3fa8604c21d5..b88e5785a3dd 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -309,12 +309,8 @@ static int fuse_dentry_delete(const struct dentry *dentry) static struct vfsmount *fuse_dentry_automount(struct path *path) { struct fs_context *fsc; - struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb); - struct fuse_conn *fc = parent_fm->fc; - struct fuse_mount *fm; struct vfsmount *mnt; struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); - struct super_block *sb; int err; fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); @@ -323,48 +319,15 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) goto out; } - err = -ENOMEM; - fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); - if (!fm) + /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */ + fsc->fs_private = mp_fi; + + err = vfs_get_tree(fsc); + if (err) goto out_put_fsc; - fsc->s_fs_info = fm; - sb = sget_fc(fsc, NULL, set_anon_super_fc); - if (IS_ERR(sb)) { - err = PTR_ERR(sb); - kfree(fm); - goto out_put_fsc; - } - fm->fc = fuse_conn_get(fc); - - /* Initialize superblock, making @mp_fi its root */ - err = fuse_fill_super_submount(sb, mp_fi); - if (err) { - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; - goto out_put_sb; - } - - down_write(&fc->killsb); - list_add_tail(&fm->fc_entry, &fc->mounts); - up_write(&fc->killsb); - - sb->s_flags |= SB_ACTIVE; - fsc->root = dget(sb->s_root); - - /* - * FIXME: setting SB_BORN requires a write barrier for - * super_cache_count(). We should actually come - * up with a proper ->get_tree() implementation - * for submounts and call vfs_get_tree() to take - * care of the write barrier. - */ - smp_wmb(); - sb->s_flags |= SB_BORN; - /* We are done configuring the superblock, so unlock it */ - up_write(&sb->s_umount); + up_write(&fsc->root->d_sb->s_umount); /* Create the submount */ mnt = vfs_create_mount(fsc); @@ -376,12 +339,6 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) put_fs_context(fsc); return mnt; -out_put_sb: - /* - * Only jump here when fsc->root is NULL and sb is still locked - * (otherwise put_fs_context() will put the superblock) - */ - deactivate_locked_super(sb); out_put_fsc: put_fs_context(fsc); out: diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a0a8228979a2..c7aee88ce7aa 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1353,8 +1353,44 @@ int fuse_fill_super_submount(struct super_block *sb, return 0; } +/* Filesystem context private data holds the FUSE inode of the mount point */ static int fuse_get_tree_submount(struct fs_context *fsc) { + struct fuse_mount *fm; + struct fuse_inode *mp_fi = fsc->fs_private; + struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode); + struct super_block *sb; + int err; + + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + return -ENOMEM; + + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (IS_ERR(sb)) { + kfree(fm); + return PTR_ERR(sb); + } + fm->fc = fuse_conn_get(fc); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) { + fuse_conn_put(fc); + kfree(fm); + sb->s_fs_info = NULL; + deactivate_locked_super(sb); + return err; + } + + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + return 0; } From 29e0e4df9d2bd1f7dd3c7293bf49e08a9d27e811 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Jun 2021 18:11:55 +0200 Subject: [PATCH 10/15] fuse: Switch to fc_mount() for submounts fc_mount() already handles the vfs_get_tree(), sb->s_umount unlocking and vfs_create_mount() sequence. Using it greatly simplifies fuse_dentry_automount(). Signed-off-by: Greg Kurz Reviewed-by: Max Reitz Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b88e5785a3dd..361150b7e807 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -311,38 +311,21 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) struct fs_context *fsc; struct vfsmount *mnt; struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); - int err; fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); - if (IS_ERR(fsc)) { - err = PTR_ERR(fsc); - goto out; - } + if (IS_ERR(fsc)) + return ERR_CAST(fsc); /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */ fsc->fs_private = mp_fi; - err = vfs_get_tree(fsc); - if (err) - goto out_put_fsc; - - /* We are done configuring the superblock, so unlock it */ - up_write(&fsc->root->d_sb->s_umount); - /* Create the submount */ - mnt = vfs_create_mount(fsc); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - goto out_put_fsc; - } - mntget(mnt); + mnt = fc_mount(fsc); + if (!IS_ERR(mnt)) + mntget(mnt); + put_fs_context(fsc); return mnt; - -out_put_fsc: - put_fs_context(fsc); -out: - return ERR_PTR(err); } const struct dentry_operations fuse_dentry_operations = { From 1b539917374d26fb64395eeb5d4baebd7ad38f61 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Jun 2021 18:11:56 +0200 Subject: [PATCH 11/15] fuse: Make fuse_fill_super_submount() static This function used to be called from fuse_dentry_automount(). This code was moved to fuse_get_tree_submount() in the same file since then. It is unlikely there will ever be another user. No need to be extern in this case. Signed-off-by: Greg Kurz Reviewed-by: Max Reitz Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 9 --------- fs/fuse/inode.c | 4 ++-- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b5957c8274dd..de6489f0a145 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1084,15 +1084,6 @@ void fuse_send_init(struct fuse_mount *fm); */ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); -/* - * Fill in superblock for submounts - * @sb: partially-initialized superblock to fill in - * @parent_fi: The fuse_inode of the parent filesystem where this submount is - * mounted - */ -int fuse_fill_super_submount(struct super_block *sb, - struct fuse_inode *parent_fi); - /* * Remove the mount from the connection * diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c7aee88ce7aa..04f25f49c37f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1315,8 +1315,8 @@ static void fuse_sb_defaults(struct super_block *sb) sb->s_xattr = fuse_no_acl_xattr_handlers; } -int fuse_fill_super_submount(struct super_block *sb, - struct fuse_inode *parent_fi) +static int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) { struct fuse_mount *fm = get_fuse_mount_super(sb); struct super_block *parent_sb = parent_fi->inode.i_sb; From 6b1bdb56b17c25f640261f3b18030cb0a21d7878 Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Wed, 12 May 2021 17:18:48 +0100 Subject: [PATCH 12/15] fuse: allow fallocate(FALLOC_FL_ZERO_RANGE) The current fuse module filters out fallocate(FALLOC_FL_ZERO_RANGE) returning -EOPNOTSUPP. libnbd's nbdfuse would like to translate FALLOC_FL_ZERO_RANGE requests into the NBD command NBD_CMD_WRITE_ZEROES which allows NBD servers that support it to do zeroing efficiently. This commit treats this flag exactly like FALLOC_FL_PUNCH_HOLE. A way to test this, requiring fuse >= 3, nbdkit >= 1.8 and the latest nbdfuse from https://gitlab.com/nbdkit/libnbd/-/tree/master/fuse is to create a file containing some data and "mirror" it to a fuse file: $ dd if=/dev/urandom of=disk.img bs=1M count=1 $ nbdkit file disk.img $ touch mirror.img $ nbdfuse mirror.img nbd://localhost & (mirror.img -> nbdfuse -> NBD over loopback -> nbdkit -> disk.img) You can then run commands such as: $ fallocate -z -o 1024 -l 1024 mirror.img and check that the content of the original file ("disk.img") stays synchronized. To show NBD commands, export LIBNBD_DEBUG=1 before running nbdfuse. To clean up: $ fusermount3 -u mirror.img $ killall nbdkit Signed-off-by: Richard W.M. Jones Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 09ef2a4d25ed..ec20a1801c1b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2907,11 +2907,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, }; int err; bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || - (mode & FALLOC_FL_PUNCH_HOLE); + (mode & (FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)); bool block_faults = FUSE_IS_DAX(inode) && lock_inode; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (fm->fc->no_fallocate) @@ -2926,7 +2928,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; } - if (mode & FALLOC_FL_PUNCH_HOLE) { + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { loff_t endbyte = offset + length - 1; err = fuse_writeback_range(inode, offset, endbyte); @@ -2966,7 +2968,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, file_update_time(file); } - if (mode & FALLOC_FL_PUNCH_HOLE) + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) truncate_pagecache_range(inode, offset, offset + length - 1); fuse_invalidate_attr(inode); From 15db16837a35d8007cb8563358787412213db25e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 21 Jun 2021 14:03:53 +0300 Subject: [PATCH 13/15] fuse: fix illegal access to inode with reused nodeid Server responds to LOOKUP and other ops (READDIRPLUS/CREATE/MKNOD/...) with ourarg containing nodeid and generation. If a fuse inode is found in inode cache with the same nodeid but different generation, the existing fuse inode should be unhashed and marked "bad" and a new inode with the new generation should be hashed instead. This can happen, for example, with passhrough fuse filesystem that returns the real filesystem ino/generation on lookup and where real inode numbers can get recycled due to real files being unlinked not via the fuse passthrough filesystem. With current code, this situation will not be detected and an old fuse dentry that used to point to an older generation real inode, can be used to access a completely new inode, which should be accessed only via the new dentry. Note that because the FORGET message carries the nodeid w/o generation, the server should wait to get FORGET counts for the nlookup counts of the old and reused inodes combined, before it can free the resources associated to that nodeid. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- fs/fuse/fuse_i.h | 7 +++++++ fs/fuse/inode.c | 4 ++-- fs/fuse/readdir.c | 7 +++++-- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 361150b7e807..eade6f965b2e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -252,7 +252,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret == -ENOMEM) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || - inode_wrong_type(inode, outarg.attr.mode)) + fuse_stale_inode(inode, outarg.generation, &outarg.attr)) goto invalid; forget_all_cached_acls(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index de6489f0a145..07829ce78695 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -870,6 +870,13 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline bool fuse_stale_inode(const struct inode *inode, int generation, + struct fuse_attr *attr) +{ + return inode->i_generation != generation || + inode_wrong_type(inode, attr->mode); +} + static inline void fuse_make_bad(struct inode *inode) { remove_inode_hash(inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 04f25f49c37f..b9beb39a4a18 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -350,8 +350,8 @@ retry: inode->i_generation = generation; fuse_init_inode(inode, attr); unlock_new_inode(inode); - } else if (inode_wrong_type(inode, attr->mode)) { - /* Inode has changed type, any I/O on the old should fail */ + } else if (fuse_stale_inode(inode, generation, attr)) { + /* nodeid was reused, any I/O on the old inode should fail */ fuse_make_bad(inode); iput(inode); goto retry; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 277f7041d55a..bc267832310c 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -200,9 +200,12 @@ retry: if (!d_in_lookup(dentry)) { struct fuse_inode *fi; inode = d_inode(dentry); + if (inode && get_node_id(inode) != o->nodeid) + inode = NULL; if (!inode || - get_node_id(inode) != o->nodeid || - inode_wrong_type(inode, o->attr.mode)) { + fuse_stale_inode(inode, o->generation, &o->attr)) { + if (inode) + fuse_make_bad(inode); d_invalidate(dentry); dput(dentry); goto retry; From 6c88632be3827899953d9bc2260da378394007b7 Mon Sep 17 00:00:00 2001 From: Wu Bo Date: Tue, 25 May 2021 15:40:47 +0800 Subject: [PATCH 14/15] fuse: use DIV_ROUND_UP helper macro for calculations Replace open coded divisor calculations with the DIV_ROUND_UP kernel macro for better readability. Signed-off-by: Wu Bo Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ec20a1801c1b..3f6ba68dfbbc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1405,7 +1405,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, nbytes += ret; ret += start; - npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + npages = DIV_ROUND_UP(ret, PAGE_SIZE); ap->descs[ap->num_pages].offset = start; fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); From c4e0cd4e0c16544ff0afecf07a5fe17de6077233 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Fri, 4 Jun 2021 09:46:17 +0800 Subject: [PATCH 15/15] virtiofs: Fix spelling mistakes Fix some spelling mistakes in comments: refernce ==> reference happnes ==> happens threhold ==> threshold splitted ==> split mached ==> matched Signed-off-by: Zheng Yongjun Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 6 +++--- fs/fuse/dev.c | 2 +- fs/fuse/file.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index ff99ab2a3c43..a005859d1c1c 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -212,7 +212,7 @@ static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, dmap->writable = writable; if (!upgrade) { /* - * We don't take a refernce on inode. inode is valid right now + * We don't take a reference on inode. inode is valid right now * and when inode is going away, cleanup logic should first * cleanup dmap entries. */ @@ -621,7 +621,7 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, } /* - * If read beyond end of file happnes, fs code seems to return + * If read beyond end of file happens, fs code seems to return * it as hole */ iomap_hole: @@ -1206,7 +1206,7 @@ static void fuse_dax_free_mem_worker(struct work_struct *work) ret); } - /* If number of free ranges are still below threhold, requeue */ + /* If number of free ranges are still below threshold, requeue */ kick_dmap_free_worker(fcd, 1); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b8d58aa08206..1c8f79b3dd06 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -91,7 +91,7 @@ static void fuse_drop_waiting(struct fuse_conn *fc) { /* * lockess check of fc->connected is okay, because atomic_dec_and_test() - * provides a memory barrier mached with the one in fuse_wait_aborted() + * provides a memory barrier matched with the one in fuse_wait_aborted() * to ensure no wake-up is missed. */ if (atomic_dec_and_test(&fc->num_waiting) && diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3f6ba68dfbbc..46ddc46b42c0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -645,7 +645,7 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. * * An example: - * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * User requested DIO read of 64K. It was split into two 32K fuse requests, * both submitted asynchronously. The first of them was ACKed by userspace as * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The * second request was ACKed as short, e.g. only 1K was read, resulting in