From fb5f51c7425ebc808d91329257cbc963e2421368 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 24 Nov 2016 08:03:41 +1100 Subject: [PATCH 01/40] vfs: change d_manage() to take a struct path For the autofs module to be able to reliably check if a dentry is a mountpoint in a multiple namespace environment the ->d_manage() dentry operation will need to take a path argument instead of a dentry. Link: http://lkml.kernel.org/r/20161011053352.27645.83962.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W. Biederman Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- fs/autofs4/root.c | 5 +++-- fs/namei.c | 13 ++++++------- include/linux/dcache.h | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1b5f15653b1b..4ca3a8d1349d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -20,7 +20,7 @@ prototypes: void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, unsigned int); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b5039a00caaf..3893f4d44cd4 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -948,7 +948,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, unsigned int); }; diff --git a/fs/autofs4/root.c 
b/fs/autofs4/root.c index a11f73174877..3a4328218b71 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -32,7 +32,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *, struct dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(struct dentry *, bool); +static int autofs4_d_manage(const struct path *, bool); static void autofs4_dentry_release(struct dentry *); const struct file_operations autofs4_root_operations = { @@ -421,8 +421,9 @@ done: return NULL; } -static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +static int autofs4_d_manage(const struct path *path, bool rcu_walk) { + struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; diff --git a/fs/namei.c b/fs/namei.c index 5b4eed221530..81ac4736b596 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1200,7 +1200,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) break; } @@ -1263,10 +1263,10 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline int managed_dentry_rcu(struct dentry *dentry) +static inline int managed_dentry_rcu(const struct path *path) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? - dentry->d_op->d_manage(dentry, true) : 0; + return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? + path->dentry->d_op->d_manage(path, true) : 0; } /* @@ -1282,7 +1282,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. 
*/ - switch (managed_dentry_rcu(path->dentry)) { + switch (managed_dentry_rcu(path)) { case -ECHILD: default: return false; @@ -1392,8 +1392,7 @@ int follow_down(struct path *path) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage( - path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 5beed7b30561..bc529ce88ed0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -139,7 +139,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, unsigned int); } ____cacheline_aligned; From c6609c0a1c34fc097152b28b496236625673924f Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 24 Nov 2016 08:03:41 +1100 Subject: [PATCH 02/40] vfs: add path_is_mountpoint() helper d_mountpoint() can only be used reliably to establish if a dentry is not mounted in any namespace. It isn't aware of the possibility there may be multiple mounts using a given dentry that may be in a different namespace. Add helper functions, path_is_mountpoint(), that checks if a struct path is a mountpoint for this case. Link: http://lkml.kernel.org/r/20161011053358.27645.9729.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W. 
Biederman Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/mount.h | 6 ++++++ fs/namespace.c | 29 +++++++++++++++++++++++++++++ include/linux/mount.h | 2 ++ 3 files changed, 37 insertions(+) diff --git a/fs/mount.h b/fs/mount.h index d2e25d7b64b3..2c856fc47ae3 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -94,6 +94,12 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); +static inline bool __path_is_mountpoint(const struct path *path) +{ + struct mount *m = __lookup_mnt(path->mnt, path->dentry); + return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT)); +} + extern void __detach_mounts(struct dentry *dentry); static inline void detach_mounts(struct dentry *dentry) diff --git a/fs/namespace.c b/fs/namespace.c index e6c234b1a645..7a73383e8365 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1159,6 +1159,35 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); +/* path_is_mountpoint() - Check if path is a mount in the current + * namespace. + * + * d_mountpoint() can only be used reliably to establish if a dentry is + * not mounted in any namespace and that common case is handled inline. + * d_mountpoint() isn't aware of the possibility there may be multiple + * mounts using a given dentry in a different namespace. This function + * checks if the passed in path is a mountpoint rather than the dentry + * alone. 
+ */ +bool path_is_mountpoint(const struct path *path) +{ + unsigned seq; + bool res; + + if (!d_mountpoint(path->dentry)) + return false; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + res = __path_is_mountpoint(path); + } while (read_seqretry(&mount_lock, seq)); + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(path_is_mountpoint); + struct vfsmount *mnt_clone_internal(struct path *path) { struct mount *p; diff --git a/include/linux/mount.h b/include/linux/mount.h index 1172cce949a4..5b6dd004bfdc 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -98,4 +98,6 @@ extern dev_t name_to_dev_t(const char *name); extern unsigned int sysctl_mount_max; +extern bool path_is_mountpoint(const struct path *path); + #endif /* _LINUX_MOUNT_H */ From 01619491a5f0766014fe863c5ae480665436e7a2 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 24 Nov 2016 08:03:41 +1100 Subject: [PATCH 03/40] vfs: add path_has_submounts() d_mountpoint() can only be used reliably to establish if a dentry is not mounted in any namespace. It isn't aware of the possibility there may be multiple mounts using the given dentry, possibly in a different namespace. Add function, path_has_submounts(), that checks if a struct path contains mounts (or is a mountpoint itself) to handle this case. Link: http://lkml.kernel.org/r/20161011053403.27645.55242.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W.
Biederman Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/dcache.h | 1 + 2 files changed, 40 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index 5c7cc953ac81..8515875854b6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1306,6 +1306,45 @@ int have_submounts(struct dentry *parent) } EXPORT_SYMBOL(have_submounts); +struct check_mount { + struct vfsmount *mnt; + unsigned int mounted; +}; + +static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) +{ + struct check_mount *info = data; + struct path path = { .mnt = info->mnt, .dentry = dentry }; + + if (likely(!d_mountpoint(dentry))) + return D_WALK_CONTINUE; + if (__path_is_mountpoint(&path)) { + info->mounted = 1; + return D_WALK_QUIT; + } + return D_WALK_CONTINUE; +} + +/** + * path_has_submounts - check for mounts over a dentry in the + * current namespace. + * @parent: path to check. + * + * Return true if the parent or its subdirectories contain + * a mount point in the current namespace. + */ +int path_has_submounts(const struct path *parent) +{ + struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; + + read_seqlock_excl(&mount_lock); + d_walk(parent->dentry, &data, path_check_mount, NULL); + read_sequnlock_excl(&mount_lock); + + return data.mounted; +} +EXPORT_SYMBOL(path_has_submounts); + /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bc529ce88ed0..0ffaf7aef9ae 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -255,6 +255,7 @@ extern void d_prune_aliases(struct inode *); /* test whether we have any submounts in a subdir tree */ extern int have_submounts(struct dentry *); +extern int path_has_submounts(const struct path *); /* * This adds the entry to the hash queues. 
From 74f504cff50b918f8ec2762b1513ae755da56a95 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 24 Nov 2016 08:03:41 +1100 Subject: [PATCH 04/40] autofs: change autofs4_expire_wait()/do_expire_wait() to take struct path In order to use the functions path_is_mountpoint() and path_has_submounts() autofs needs to pass a struct path in several places. Start by changing autofs4_expire_wait() and do_expire_wait() to take a struct path instead of a struct dentry. Link: http://lkml.kernel.org/r/20161011053408.27645.40091.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W. Biederman Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/dev-ioctl.c | 2 +- fs/autofs4/expire.c | 3 ++- fs/autofs4/root.c | 12 +++++++----- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a1fba4285277..45311525ad89 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -145,7 +145,7 @@ void autofs4_free_ino(struct autofs_info *); /* Expiration */ int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); +int autofs4_expire_wait(const struct path *path, int rcu_walk); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index fc09eb77ddf3..40c69f91dc0a 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -468,7 +468,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, ino = autofs4_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(path.dentry, 0); + autofs4_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index d8e6d421c27f..6ba6107e6102 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -495,8 
+495,9 @@ found: return expired; } -int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) +int autofs4_expire_wait(const struct path *path, int rcu_walk) { + struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 3a4328218b71..6b06337ca9fc 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -286,22 +286,24 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) return status; } -static int do_expire_wait(struct dentry *dentry, bool rcu_walk) +static int do_expire_wait(const struct path *path, bool rcu_walk) { + struct dentry *dentry = path->dentry; struct dentry *expiring; expiring = autofs4_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(dentry, rcu_walk); + return autofs4_expire_wait(path, rcu_walk); else { + struct path this = { .mnt = path->mnt, .dentry = expiring }; /* * If we are racing with expire the request might not * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. */ - autofs4_expire_wait(expiring, 0); + autofs4_expire_wait(&this, 0); autofs4_del_expiring(expiring); dput(expiring); } @@ -354,7 +356,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * and the directory was removed, so just go ahead and try * the mount. 
*/ - status = do_expire_wait(dentry, 0); + status = do_expire_wait(path, 0); if (status && status != -EAGAIN) return NULL; @@ -438,7 +440,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) } /* Wait for pending expires */ - if (do_expire_wait(dentry, rcu_walk) == -ECHILD) + if (do_expire_wait(path, rcu_walk) == -ECHILD) return -ECHILD; /* From dd36a882e7ade2c642f8711426ad8e4b7009aaae Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 24 Nov 2016 08:03:42 +1100 Subject: [PATCH 05/40] autofs: change autofs4_wait() to take struct path In order to use the functions path_is_mountpoint() and path_has_submounts() autofs needs to pass a struct path in several places. Now change autofs4_wait() to take a struct path instead of a struct dentry. Link: http://lkml.kernel.org/r/20161011053413.27645.84666.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W. Biederman Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/autofs4/autofs_i.h | 3 ++- fs/autofs4/expire.c | 5 +++-- fs/autofs4/root.c | 16 ++++++++-------- fs/autofs4/waitq.c | 3 ++- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 45311525ad89..c885daae68c8 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -217,7 +217,8 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify); +int autofs4_wait(struct autofs_sb_info *, + const struct path *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); void autofs4_catatonic_mode(struct autofs_sb_info *); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 6ba6107e6102..9c352da24444 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -526,7 +526,7 @@ retry: pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - status = autofs4_wait(sbi, dentry, NFY_NONE); + 
status = autofs4_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); @@ -593,11 +593,12 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (dentry) { struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ - ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + ret = autofs4_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 6b06337ca9fc..0e9881552881 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -269,17 +269,17 @@ next: return NULL; } -static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) +static int autofs4_mount_wait(const struct path *path, bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(path->dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; - pr_debug("waiting for mount name=%pd\n", dentry); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); + pr_debug("waiting for mount name=%pd\n", path->dentry); + status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; @@ -364,7 +364,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry, 0); + status = autofs4_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; @@ -405,7 +405,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = 
autofs4_mount_wait(dentry, 0); + status = autofs4_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -447,7 +447,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) * This dentry may be under construction so wait on mount * completion. */ - status = autofs4_mount_wait(dentry, rcu_walk); + status = autofs4_mount_wait(path, rcu_walk); if (status) return status; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e44271dfceb6..38ef973f80e9 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -345,8 +345,9 @@ static int validate_request(struct autofs_wait_queue **wait, } int autofs4_wait(struct autofs_sb_info *sbi, - struct dentry *dentry, enum autofs_notify notify) + const struct path *path, enum autofs_notify notify) { + struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct qstr qstr; char *name; From cfaf86ab6ccdf0acf47ebe474a4a844114bc0e10 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 24 Nov 2016 08:03:42 +1100 Subject: [PATCH 06/40] autofs: use path_is_mountpoint() to fix unreliable d_mountpoint() checks If an automount mount is clone(2)ed into a file system that is propagation private, when it later expires in the originating namespace, subsequent calls to autofs ->d_automount() for that dentry in the original namespace will return ELOOP until the mount is umounted in the cloned namespace. Now that a struct path is available where needed use path_is_mountpoint() instead of d_mountpoint() so we don't get false positives when checking if a dentry is a mount point in the current namespace. Link: http://lkml.kernel.org/r/20161011053418.27645.15241.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W. 
Biederman Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/autofs4/root.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 0e9881552881..9355608cb495 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -123,7 +123,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * it. */ spin_lock(&sbi->lookup_lock); - if (!d_mountpoint(dentry) && simple_empty(dentry)) { + if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } @@ -372,15 +372,15 @@ static struct vfsmount *autofs4_d_automount(struct path *path) /* * If the dentry is a symlink it's equivalent to a directory - * having d_mountpoint() true, so there's no need to call back - * to the daemon. + * having path_is_mountpoint() true, so there's no need to call + * back to the daemon. */ if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } - if (!d_mountpoint(dentry)) { + if (!path_is_mountpoint(path)) { /* * It's possible that user space hasn't removed directories * after umounting a rootless multi-mount, although it @@ -434,7 +434,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { - if (!d_mountpoint(dentry)) + if (!path_is_mountpoint(path)) return -EISDIR; return 0; } @@ -463,7 +463,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) if (ino->flags & AUTOFS_INF_WANT_EXPIRE) return 0; - if (d_mountpoint(dentry)) + if (path_is_mountpoint(path)) return 0; inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) @@ -490,7 +490,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) * we can avoid needless calls ->d_automount() and avoid * an incorrect ELOOP error return. 
*/ - if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || + if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } From 60359741473438f66fef7297d1fa2435640dbe79 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 24 Nov 2016 08:03:42 +1100 Subject: [PATCH 07/40] autofs: use path_has_submounts() to fix unreliable have_submount() checks If an automount mount is clone(2)ed into a file system that is propagation private, when it later expires in the originating namespace, subsequent calls to autofs ->d_automount() for that dentry in the original namespace will return ELOOP until the mount is umounted in the cloned namespace. Now that a struct path is available where needed use path_has_submounts() instead of have_submounts() so we don't get false positives when checking if a dentry is a mount point or contains mounts in the current namespace. Link: http://lkml.kernel.org/r/20161011053423.27645.91233.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W. 
Biederman Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/autofs4/dev-ioctl.c | 2 +- fs/autofs4/root.c | 14 +++++++------- fs/autofs4/waitq.c | 10 +++++++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 40c69f91dc0a..afacdaa8dd5a 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -575,7 +575,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, devid = new_encode_dev(dev); - err = have_submounts(path.dentry); + err = path_has_submounts(&path); if (follow_down_one(&path)) magic = path.dentry->d_sb->s_magic; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 9355608cb495..ce4e6216a5f0 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -384,16 +384,16 @@ static struct vfsmount *autofs4_d_automount(struct path *path) /* * It's possible that user space hasn't removed directories * after umounting a rootless multi-mount, although it - * should. For v5 have_submounts() is sufficient to handle - * this because the leaves of the directory tree under the - * mount never trigger mounts themselves (they have an autofs - * trigger mount mounted on them). But v4 pseudo direct mounts - * do need the leaves to trigger mounts. In this case we - * have no choice but to use the list_empty() check and + * should. For v5 path_has_submounts() is sufficient to + * handle this because the leaves of the directory tree under + * the mount never trigger mounts themselves (they have an + * autofs trigger mount mounted on them). But v4 pseudo direct + * mounts do need the leaves to trigger mounts. In this case + * we have no choice but to use the list_empty() check and * require user space behave. 
*/ if (sbi->version > 4) { - if (have_submounts(dentry)) { + if (path_has_submounts(path)) { spin_unlock(&sbi->fs_lock); goto done; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 38ef973f80e9..1278335ce366 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -250,8 +250,9 @@ autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, const struct qstr *qstr, - struct dentry *dentry, enum autofs_notify notify) + const struct path *path, enum autofs_notify notify) { + struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct autofs_info *ino; @@ -314,6 +315,7 @@ static int validate_request(struct autofs_wait_queue **wait, */ if (notify == NFY_MOUNT) { struct dentry *new = NULL; + struct path this; int valid = 1; /* @@ -333,7 +335,9 @@ static int validate_request(struct autofs_wait_queue **wait, dentry = new; } } - if (have_submounts(dentry)) + this.mnt = path->mnt; + this.dentry = dentry; + if (path_has_submounts(&this)) valid = 0; if (new) @@ -406,7 +410,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -EINTR; } - ret = validate_request(&wq, sbi, &qstr, dentry, notify); + ret = validate_request(&wq, sbi, &qstr, path, notify); if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); From f74e7b33c37e5a7bae33bb73858c2766cb256626 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 24 Nov 2016 08:03:42 +1100 Subject: [PATCH 08/40] vfs: remove unused have_submounts() function Now that path_has_submounts() has been added have_submounts() is no longer used so remove it. Link: http://lkml.kernel.org/r/20161011053428.27645.12310.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W. 
Biederman Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 33 --------------------------------- include/linux/dcache.h | 1 - 2 files changed, 34 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 8515875854b6..252378359a8f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1273,39 +1273,6 @@ rename_retry: goto again; } -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ - -static enum d_walk_ret check_mount(void *data, struct dentry *dentry) -{ - int *ret = data; - if (d_mountpoint(dentry)) { - *ret = 1; - return D_WALK_QUIT; - } - return D_WALK_CONTINUE; -} - -/** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. - * - * Return true if the parent or its subdirectories contain - * a mount point - */ -int have_submounts(struct dentry *parent) -{ - int ret = 0; - - d_walk(parent, &ret, check_mount, NULL); - - return ret; -} -EXPORT_SYMBOL(have_submounts); - struct check_mount { struct vfsmount *mnt; unsigned int mounted; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 0ffaf7aef9ae..c965e4469499 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -254,7 +254,6 @@ extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); /* test whether we have any submounts in a subdir tree */ -extern int have_submounts(struct dentry *); extern int path_has_submounts(const struct path *); /* From 455e8f1030de82b68ee4e82f71516f3692f5e626 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 28 Nov 2016 10:11:59 +0800 Subject: [PATCH 09/40] autofs - constify misc struct path instances Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W. 
Biederman Cc: Omar Sandoval Signed-off-by: Al Viro --- fs/autofs4/expire.c | 2 +- fs/autofs4/root.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 9c352da24444..13178bf2c431 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -593,7 +593,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (dentry) { struct autofs_info *ino = autofs4_dentry_ino(dentry); - struct path path = { .mnt = mnt, .dentry = dentry }; + const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index ce4e6216a5f0..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -297,7 +297,7 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) if (!expiring) return autofs4_expire_wait(path, rcu_walk); else { - struct path this = { .mnt = path->mnt, .dentry = expiring }; + const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* * If we are racing with expire the request might not * be quite complete, but the directory has been removed From 1c4344a50d702307185cb98fb67bff938cd66aa0 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Mon, 28 Nov 2016 10:12:14 +0800 Subject: [PATCH 10/40] autofs - dont hold spin lock over direct mount expire Commit 7cbdb4a286 altered the autofs indirect mount expire to not hold a spin lock during the expire check. The direct mount expire needs the same treatment because to make autofs expires namespace aware may_umount_tree() needs to use a similar method to may_umount() when checking if a mount tree is in use. This means may_umount_tree() will end up taking the namespace_sem for the check so the autofs direct mount expire won't be allowed to hold a spin lock over the check. Signed-off-by: Ian Kent Cc: Al Viro Cc: Eric W.
Biederman Cc: Omar Sandoval Signed-off-by: Al Viro --- fs/autofs4/expire.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 13178bf2c431..57725d4a8c59 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -310,26 +310,29 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(root); - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + goto out; + } ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); } out: - spin_unlock(&sbi->fs_lock); dput(root); return NULL; From 640eb7e7b5242af53c456552a526d0080e6333f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 14 Nov 2016 22:14:35 +0100 Subject: [PATCH 11/40] fs: Constify path_is_under()'s arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function path_is_under() doesn't modify the paths pointed to by its arguments but only browses them. Constifying these pointers makes for a cleaner interface to be used by (future) code which may only have access to const struct path pointers (e.g. LSM hooks).
Signed-off-by: Mickaël Salaün Cc: Alexander Viro Signed-off-by: Al Viro --- fs/namespace.c | 2 +- include/linux/fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index e6c234b1a645..4d80a5066a1f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2997,7 +2997,7 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } -bool path_is_under(struct path *path1, struct path *path2) +bool path_is_under(const struct path *path1, const struct path *path2) { bool res; read_seqlock_excl(&mount_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index dc0478c07b2a..f96501b51c49 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2709,7 +2709,7 @@ extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); -extern bool path_is_under(struct path *, struct path *); +extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); From 3cd5eca8d7a2fe43098df4c33a1272fe6945cac9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:19:09 -0500 Subject: [PATCH 12/40] fsnotify: constify 'data' passed to ->handle_event() Signed-off-by: Al Viro --- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify.c | 8 ++++---- fs/notify/fanotify/fanotify.h | 2 +- fs/notify/inotify/inotify.h | 2 +- fs/notify/inotify/inotify_fsnotify.c | 4 ++-- include/linux/fsnotify_backend.h | 2 +- kernel/audit_fsnotify.c | 10 +++++----- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 8 ++++---- 9 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6faaf710e563..5a4ec309e283 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -85,7 +85,7 @@ static int dnotify_handle_event(struct fsnotify_group 
*group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e0e5f7c3c99f..bbc175d4213d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -90,10 +90,10 @@ static int fanotify_get_response(struct fsnotify_group *group, static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, u32 event_mask, - void *data, int data_type) + const void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; - struct path *path = data; + const struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, @@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path) + const struct path *path) { struct fanotify_event_info *event; @@ -177,7 +177,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { int ret = 0; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 2a5fb14115df..4500a74f8d38 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -47,4 +47,4 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path); + const struct path *path); diff --git a/fs/notify/inotify/inotify.h 
b/fs/notify/inotify/inotify.h index ed855ef6f077..a6f5907a3fee 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -26,7 +26,7 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2cd900c2c737..19e7ec109a75 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,7 +66,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; @@ -80,7 +80,7 @@ int inotify_handle_event(struct fsnotify_group *group, if ((inode_mark->mask & FS_EXCL_UNLINK) && (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; + const struct path *path = data; if (d_unlinked(path->dentry)) return 0; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 79467b239fcf..d357041bbec8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -96,7 +96,7 @@ struct fsnotify_ops { struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f84f8d06e1f6..1173f7cc7ba3 100644 --- 
a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -74,7 +74,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_ } static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, - struct inode *inode) + const struct inode *inode) { audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET; @@ -168,11 +168,11 @@ static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { struct audit_fsnotify_mark *audit_mark; - struct inode *inode = NULL; + const struct inode *inode = NULL; audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); @@ -180,10 +180,10 @@ static int audit_mark_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = ((struct path *)data)->dentry->d_inode; + inode = ((const struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 25772476fa4a..3a2f5dfe8093 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -948,7 +948,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..f476c46b9c20 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -472,10 +472,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct 
fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { - struct inode *inode; + const struct inode *inode; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); @@ -484,10 +484,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = d_backing_inode(((struct path *)data)->dentry); + inode = d_backing_inode(((const struct path *)data)->dentry); break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); From e637835eccc8b93f39ca869628f9a0437bba744e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:21:17 -0500 Subject: [PATCH 13/40] fsnotify(): constify 'data' Signed-off-by: Al Viro --- fs/notify/fsnotify.c | 6 +++--- include/linux/fsnotify_backend.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index db39de2dd4cb..7788a79eedf7 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -125,7 +125,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent); static int send_to_group(struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, + __u32 mask, const void *data, int data_is, u32 cookie, const unsigned char *file_name) { @@ -187,7 +187,7 @@ static int send_to_group(struct inode *to_tell, * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary. 
*/ -int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, +int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; @@ -199,7 +199,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); if (data_is == FSNOTIFY_EVENT_PATH) - mnt = real_mount(((struct path *)data)->mnt); + mnt = real_mount(((const struct path *)data)->mnt); else mnt = NULL; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d357041bbec8..e6ea6757a275 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -245,7 +245,7 @@ struct fsnotify_mark { /* called from the vfs helpers */ /* main fsnotify call to send events */ -extern int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, +extern int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *name, u32 cookie); extern int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask); extern void __fsnotify_inode_delete(struct inode *inode); @@ -357,7 +357,7 @@ extern void fsnotify_init_event(struct fsnotify_event *event, #else -static inline int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, +static inline int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *name, u32 cookie) { return 0; From 12c7f9dc0fd154632457f3474351bcfcf4e61512 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:23:04 -0500 Subject: [PATCH 14/40] constify fsnotify_parent() Signed-off-by: Al Viro --- fs/notify/fsnotify.c | 2 +- include/linux/fsnotify.h | 2 +- include/linux/fsnotify_backend.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7788a79eedf7..b41515d3f081 100644 --- 
a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -86,7 +86,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } /* Notify this dentry's parent about a child's events. */ -int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) { struct dentry *parent; struct inode *p_inode; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index b8bcc058e031..e19eb1f5e958 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -17,7 +17,7 @@ #include /* Notify this dentry's parent about a child's events. */ -static inline int fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +static inline int fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) { if (!dentry) dentry = path->dentry; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index e6ea6757a275..0cf34d6cc253 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -247,7 +247,7 @@ struct fsnotify_mark { /* main fsnotify call to send events */ extern int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *name, u32 cookie); -extern int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask); +extern int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern u32 fsnotify_get_cookie(void); @@ -363,7 +363,7 @@ static inline int fsnotify(struct inode *to_tell, __u32 mask, const void *data, return 0; } -static inline int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +static inline int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) { return 0; } From 40212d531d4bfac48dca8cd3d794639766745cda Mon Sep 17 00:00:00 2001 
From: Al Viro Date: Sun, 20 Nov 2016 20:24:41 -0500 Subject: [PATCH 15/40] fsnotify: constify the places working with ->f_path Signed-off-by: Al Viro --- include/linux/fsnotify.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index e19eb1f5e958..b43d3f5bd9ea 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -28,7 +28,7 @@ static inline int fsnotify_parent(const struct path *path, struct dentry *dentry /* simple call site for access decisions */ static inline int fsnotify_perm(struct file *file, int mask) { - struct path *path = &file->f_path; + const struct path *path = &file->f_path; /* * Do not use file_inode() here or anywhere in this file to get the * inode. That would break *notity on overlayfs. @@ -176,7 +176,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry) */ static inline void fsnotify_access(struct file *file) { - struct path *path = &file->f_path; + const struct path *path = &file->f_path; struct inode *inode = path->dentry->d_inode; __u32 mask = FS_ACCESS; @@ -194,7 +194,7 @@ static inline void fsnotify_access(struct file *file) */ static inline void fsnotify_modify(struct file *file) { - struct path *path = &file->f_path; + const struct path *path = &file->f_path; struct inode *inode = path->dentry->d_inode; __u32 mask = FS_MODIFY; @@ -212,7 +212,7 @@ static inline void fsnotify_modify(struct file *file) */ static inline void fsnotify_open(struct file *file) { - struct path *path = &file->f_path; + const struct path *path = &file->f_path; struct inode *inode = path->dentry->d_inode; __u32 mask = FS_OPEN; @@ -228,7 +228,7 @@ static inline void fsnotify_open(struct file *file) */ static inline void fsnotify_close(struct file *file) { - struct path *path = &file->f_path; + const struct path *path = &file->f_path; struct inode *inode = path->dentry->d_inode; fmode_t mode = file->f_mode; __u32 mask = (mode & FMODE_WRITE) ? 
FS_CLOSE_WRITE : FS_CLOSE_NOWRITE; From 8bd107633b64195a0748b05236c3d14db0a8bed4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:36:51 -0500 Subject: [PATCH 16/40] audit_log_{name,link_denied}: constify struct path Signed-off-by: Al Viro --- include/linux/audit.h | 2 +- kernel/audit.c | 4 ++-- kernel/audit.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index 9d4443f93db6..f51fca8d0b6f 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -147,7 +147,7 @@ extern void audit_log_d_path(struct audit_buffer *ab, extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_link_denied(const char *operation, - struct path *link); + const struct path *link); extern void audit_log_lost(const char *message); #ifdef CONFIG_SECURITY extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); diff --git a/kernel/audit.c b/kernel/audit.c index f1ca11613379..06008c422bd5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1760,7 +1760,7 @@ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, * @call_panic: optional pointer to int that will be updated if secid fails */ void audit_log_name(struct audit_context *context, struct audit_names *n, - struct path *path, int record_num, int *call_panic) + const struct path *path, int record_num, int *call_panic) { struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); @@ -1948,7 +1948,7 @@ EXPORT_SYMBOL(audit_log_task_info); * @operation: specific link operation * @link: the path that triggered the restriction */ -void audit_log_link_denied(const char *operation, struct path *link) +void audit_log_link_denied(const char *operation, const struct path *link) { struct audit_buffer *ab; struct audit_names *name; diff --git a/kernel/audit.h b/kernel/audit.h index 431444c3708b..960d49c9db5e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -212,7 +212,7 @@ extern void 
audit_copy_inode(struct audit_names *name, extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, - struct audit_names *n, struct path *path, + struct audit_names *n, const struct path *path, int record_num, int *call_panic); extern int audit_pid; From 71215a75ceddf38ba9d4563481da8dd943de10fc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 19:30:18 -0500 Subject: [PATCH 17/40] constify get_dcookie() and friends Signed-off-by: Al Viro --- arch/powerpc/oprofile/cell/spu_task_sync.c | 2 +- drivers/oprofile/buffer_sync.c | 2 +- fs/dcookies.c | 4 ++-- include/linux/dcookies.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c index 83d2b4ef7f0d..44d67b167e0b 100644 --- a/arch/powerpc/oprofile/cell/spu_task_sync.c +++ b/arch/powerpc/oprofile/cell/spu_task_sync.c @@ -295,7 +295,7 @@ out: * dcookie user still being registered (namely, the reader * of the event buffer). */ -static inline unsigned long fast_get_dcookie(struct path *path) +static inline unsigned long fast_get_dcookie(const struct path *path) { unsigned long cookie; diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 82f7000a285d..642478d35e99 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -206,7 +206,7 @@ void sync_stop(void) * because we cannot reach this code without at least one * dcookie user still being registered (namely, the reader * of the event buffer). 
*/ -static inline unsigned long fast_get_dcookie(struct path *path) +static inline unsigned long fast_get_dcookie(const struct path *path) { unsigned long cookie; diff --git a/fs/dcookies.c b/fs/dcookies.c index ac44a69fbea9..a26a701ef512 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -90,7 +90,7 @@ static void hash_dcookie(struct dcookie_struct * dcs) } -static struct dcookie_struct *alloc_dcookie(struct path *path) +static struct dcookie_struct *alloc_dcookie(const struct path *path) { struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, GFP_KERNEL); @@ -113,7 +113,7 @@ static struct dcookie_struct *alloc_dcookie(struct path *path) /* This is the main kernel-side routine that retrieves the cookie * value for a dentry/vfsmnt pair. */ -int get_dcookie(struct path *path, unsigned long *cookie) +int get_dcookie(const struct path *path, unsigned long *cookie) { int err = 0; struct dcookie_struct * dcs; diff --git a/include/linux/dcookies.h b/include/linux/dcookies.h index 5ac3bdd5cee6..699b6c499c4f 100644 --- a/include/linux/dcookies.h +++ b/include/linux/dcookies.h @@ -44,7 +44,7 @@ void dcookie_unregister(struct dcookie_user * user); * * Returns 0 on success, with *cookie filled in */ -int get_dcookie(struct path *path, unsigned long *cookie); +int get_dcookie(const struct path *path, unsigned long *cookie); #else @@ -58,7 +58,7 @@ static inline void dcookie_unregister(struct dcookie_user * user) return; } -static inline int get_dcookie(struct path *path, unsigned long *cookie) +static inline int get_dcookie(const struct path *path, unsigned long *cookie) { return -ENOSYS; } From 5b5577e4ebbd63de1a774963a373d851857e980a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 19:33:32 -0500 Subject: [PATCH 18/40] autofs: constify find_autofs_mount() callback Signed-off-by: Al Viro --- fs/autofs4/dev-ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 
fc09eb77ddf3..dfc6f49ee597 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -204,7 +204,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, /* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, - int test(struct path *path, void *data), + int test(const struct path *path, void *data), void *data) { struct path path; @@ -230,12 +230,12 @@ static int find_autofs_mount(const char *pathname, return err; } -static int test_by_dev(struct path *path, void *p) +static int test_by_dev(const struct path *path, void *p) { return path->dentry->d_sb->s_dev == *(dev_t *)p; } -static int test_by_type(struct path *path, void *p) +static int test_by_type(const struct path *path, void *p) { struct autofs_info *ino = autofs4_dentry_ino(path->dentry); From 92872094a163b6b6954c46b3d1e36ab9c8b2b32c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 19:34:31 -0500 Subject: [PATCH 19/40] constify btrfs_mksubvol() Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7acbd2cf6192..8270f4338b35 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -836,7 +836,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. 
*/ -static noinline int btrfs_mksubvol(struct path *parent, +static noinline int btrfs_mksubvol(const struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, From a4141d7cf80fee99ace8d8a95dd358c98ad6ad69 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:28:12 -0500 Subject: [PATCH 20/40] constify alloc_file() Signed-off-by: Al Viro --- fs/file_table.c | 2 +- include/linux/file.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index ad17e05ebf95..6d982b57de92 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -155,7 +155,7 @@ over: * @mode: the mode with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -struct file *alloc_file(struct path *path, fmode_t mode, +struct file *alloc_file(const struct path *path, fmode_t mode, const struct file_operations *fop) { struct file *file; diff --git a/include/linux/file.h b/include/linux/file.h index 7444f5feda12..61eb82cbafba 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -17,7 +17,7 @@ struct file_operations; struct vfsmount; struct dentry; struct path; -extern struct file *alloc_file(struct path *, fmode_t mode, +extern struct file *alloc_file(const struct path *, fmode_t mode, const struct file_operations *fop); static inline void fput_light(struct file *file, int fput_needed) From 8c54ca9c6882f5a68d19a82fd063b74f91d4c22b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 19:49:34 -0500 Subject: [PATCH 21/40] quota: constify struct path in quota_on Signed-off-by: Al Viro --- fs/ext4/super.c | 4 ++-- fs/quota/dquot.c | 2 +- fs/quota/quota.c | 4 ++-- fs/reiserfs/super.c | 4 ++-- include/linux/quota.h | 2 +- include/linux/quotaops.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 52b0530c5d65..2d97f7a29d09 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1187,7 
+1187,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path); + const struct path *path); static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, @@ -5239,7 +5239,7 @@ static void lockdep_set_quota_inode(struct inode *inode, int subclass) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int err; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1bfac28b7e7d..8738a0d62c09 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2401,7 +2401,7 @@ int dquot_resume(struct super_block *sb, int type) EXPORT_SYMBOL(dquot_resume); int dquot_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int error = security_quota_on(path->dentry); if (error) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2d445425aad7..5acd0c4769af 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -80,7 +80,7 @@ unsigned int qtype_enforce_flag(int type) } static int quota_quotaon(struct super_block *sb, int type, qid_t id, - struct path *path) + const struct path *path) { if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; @@ -700,7 +700,7 @@ static int quota_rmxquota(struct super_block *sb, void __user *addr) /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr, struct path *path) + void __user *addr, const struct path *path) { int ret; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0a6ad4e71e88..e314cb30a181 100644 --- 
a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -802,7 +802,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, struct path *); +static int reiserfs_quota_on(struct super_block *, int, int, const struct path *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -2348,7 +2348,7 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int err; struct inode *inode; diff --git a/include/linux/quota.h b/include/linux/quota.h index 55107a8ff887..78a98821f9d0 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -431,7 +431,7 @@ struct qc_info { /* Operations handling requests from userspace */ struct quotactl_ops { - int (*quota_on)(struct super_block *, int, int, struct path *); + int (*quota_on)(struct super_block *, int, int, const struct path *); int (*quota_off)(struct super_block *, int); int (*quota_enable)(struct super_block *, unsigned int); int (*quota_disable)(struct super_block *, unsigned int); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index f00fa86ac966..799a63d0e1a8 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -90,7 +90,7 @@ int dquot_file_open(struct inode *inode, struct file *file); int dquot_enable(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, - struct path *path); + const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int dquot_quota_off(struct super_block *sb, int 
type); From ca71cf71eeda04dc9ad18271504e499013af5415 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 19:45:28 -0500 Subject: [PATCH 22/40] namespace.c: constify struct path passed to a bunch of primitives Signed-off-by: Al Viro --- fs/internal.h | 2 +- fs/namespace.c | 8 ++++---- include/linux/fs.h | 2 +- include/linux/mount.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index f4da3341b4a3..3e460159d835 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -62,7 +62,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, extern void *copy_mount_options(const void __user *); extern char *copy_mount_string(const void __user *); -extern struct vfsmount *lookup_mnt(struct path *); +extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, struct path *); extern int sb_prepare_remount_readonly(struct super_block *); diff --git a/fs/namespace.c b/fs/namespace.c index 4d80a5066a1f..9ad88a45b3e3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -678,7 +678,7 @@ out: * * lookup_mnt takes a reference to the found vfsmount. */ -struct vfsmount *lookup_mnt(struct path *path) +struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; @@ -1159,7 +1159,7 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -struct vfsmount *mnt_clone_internal(struct path *path) +struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); @@ -1758,7 +1758,7 @@ out: /* Caller should check returned pointer for errors */ -struct vfsmount *collect_mounts(struct path *path) +struct vfsmount *collect_mounts(const struct path *path) { struct mount *tree; namespace_lock(); @@ -1791,7 +1791,7 @@ void drop_collected_mounts(struct vfsmount *mnt) * * Release with mntput(). 
*/ -struct vfsmount *clone_private_mount(struct path *path) +struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; diff --git a/include/linux/fs.h b/include/linux/fs.h index f96501b51c49..3056fe46f336 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2123,7 +2123,7 @@ extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(const char *, const char __user *, const char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(struct path *); +extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, struct vfsmount *); diff --git a/include/linux/mount.h b/include/linux/mount.h index 1172cce949a4..cf2b5784b649 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -79,12 +79,12 @@ extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); -extern struct vfsmount *mnt_clone_internal(struct path *path); +extern struct vfsmount *mnt_clone_internal(const struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); struct path; -extern struct vfsmount *clone_private_mount(struct path *path); +extern struct vfsmount *clone_private_mount(const struct path *path); struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, From f0bb5aaf2c51267c49ed5e2c6103df22acfe30f5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:27:12 -0500 Subject: [PATCH 23/40] vfs: misc struct path constification Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- fs/statfs.c | 2 +- fs/utimes.c | 2 +- include/linux/fs.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) 
diff --git a/fs/namei.c b/fs/namei.c index 5b4eed221530..1c8f4386b03f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2895,7 +2895,7 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(struct path *path, int acc_mode, int flag) +static int may_open(const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; @@ -2945,7 +2945,7 @@ static int may_open(struct path *path, int acc_mode, int flag) static int handle_truncate(struct file *filp) { - struct path *path = &filp->f_path; + const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) diff --git a/fs/statfs.c b/fs/statfs.c index 083dc0ac9140..13ae259d4879 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -63,7 +63,7 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) return retval; } -int vfs_statfs(struct path *path, struct kstatfs *buf) +int vfs_statfs(const struct path *path, struct kstatfs *buf) { int error; diff --git a/fs/utimes.c b/fs/utimes.c index 22307cdf7014..5fdb505e307c 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -48,7 +48,7 @@ static bool nsec_valid(long nsec) return nsec >= 0 && nsec <= 999999999; } -static int utimes_common(struct path *path, struct timespec *times) +static int utimes_common(const struct path *path, struct timespec *times) { int error; struct iattr newattrs; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3056fe46f336..0e177d395efb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2127,7 +2127,7 @@ extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, struct vfsmount *); -extern int vfs_statfs(struct path *, struct kstatfs *); +extern int vfs_statfs(const struct path *, struct kstatfs *); extern int 
user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); extern int vfs_ustat(dev_t, struct kstatfs *); From a76b5b04375f974579c83433b06466758c0c552c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Dec 2016 16:17:19 -0800 Subject: [PATCH 24/40] fs: try to clone files first in vfs_copy_file_range A clone is a perfectly fine implementation of a file copy, so most file systems just implement the copy that way. Instead of duplicating this logic move it to the VFS. Currently btrfs and XFS implement copies the same way as clones and there is no behavior change for them, cifs only implements clones and grows support for copy_file_range with this patch. NFS implements both, so this will allow copy_file_range to work on servers that only implement CLONE and be a lot more efficient on servers that implement CLONE and COPY. Signed-off-by: Christoph Hellwig --- fs/btrfs/ctree.h | 3 --- fs/btrfs/file.c | 1 - fs/btrfs/ioctl.c | 12 ------------ fs/read_write.c | 27 ++++++++++++++++++++++----- fs/xfs/xfs_file.c | 19 ------------------- 5 files changed, 22 insertions(+), 40 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b8ce2b9f7d0..05f75a949af4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3232,9 +3232,6 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags); int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3a14c87d9c92..991cc991fd29 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2998,7 +2998,6 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_compat_ioctl, #endif - .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, .dedupe_file_range = btrfs_dedupe_file_range, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7acbd2cf6192..dab746298758 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3980,18 +3980,6 @@ out_unlock: return ret; } -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - ssize_t ret; - - ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out); - if (ret == 0) - ret = len; - return ret; -} - int btrfs_clone_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, u64 len) { diff --git a/fs/read_write.c b/fs/read_write.c index 190e0d362581..6674a4b83c54 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1542,20 +1542,37 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (ret) return ret; - ret = -EOPNOTSUPP; - if (file_out->f_op->copy_file_range) + /* + * Try cloning first, this is supported by more file systems, and + * more efficient if both clone and copy are supported (e.g. NFS). + */ + if (file_in->f_op->clone_file_range) { + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (ret == 0) { + ret = len; + goto done; + } + } + + if (file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - if (ret == -EOPNOTSUPP) - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + if (ret != -EOPNOTSUPP) + goto done; + } + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? 
MAX_RW_COUNT : len, 0); + +done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } + inc_syscr(current); inc_syscw(current); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6e4f7f900fea..86ecc9b49e15 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -909,24 +909,6 @@ out_unlock: return error; } -STATIC ssize_t -xfs_file_copy_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - size_t len, - unsigned int flags) -{ - int error; - - error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); - if (error) - return error; - return len; -} - STATIC int xfs_file_clone_range( struct file *file_in, @@ -1625,7 +1607,6 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .copy_file_range = xfs_file_copy_range, .clone_file_range = xfs_file_clone_range, .dedupe_file_range = xfs_file_dedupe_range, }; From 876bec6f9bbfcb394916d17e35226b086c04dc45 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 9 Dec 2016 16:18:30 -0800 Subject: [PATCH 25/40] vfs: refactor clone/dedupe_file_range common functions Hoist both the XFS reflink inode state and preparation code and the XFS file blocks compare functions into the VFS so that ocfs2 can take advantage of it for reflink and dedupe. Signed-off-by: Darrick J. Wong --- fs/read_write.c | 204 +++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_reflink.c | 213 ++----------------------------------------- include/linux/fs.h | 6 ++ 3 files changed, 219 insertions(+), 204 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 6674a4b83c54..dbf3f7ffdf3f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1667,6 +1667,114 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? 
MAY_WRITE : MAY_READ); } +/* + * Check that the two inodes are eligible for cloning, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the + * inodes have been locked against any other modifications. + */ +int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, + struct inode *inode_out, loff_t pos_out, + u64 *len, bool is_dedupe) +{ + loff_t bs = inode_out->i_sb->s_blocksize; + loff_t blen; + loff_t isize; + bool same_inode = (inode_in == inode_out); + int ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) { + *len = 0; + return 0; + } + + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (*len == 0) { + if (is_dedupe) { + *len = 0; + return 0; + } + *len = isize - pos_in; + } + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + *len < pos_in || pos_out + *len < pos_out || + pos_in + *len > isize) + return -EINVAL; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + *len > disize) + return -EINVAL; + } + + /* If we're linking to EOF, continue to the block boundary. 
*/ + if (pos_in + *len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = *len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + return -EINVAL; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode) { + if (pos_out + blen > pos_in && pos_out < pos_in + blen) + return -EINVAL; + } + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + *len - 1); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + *len - 1); + if (ret) + return ret; + + /* + * Check that the extents are the same. + */ + if (is_dedupe) { + bool is_same = false; + + ret = vfs_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same); + if (ret) + return ret; + if (!is_same) + return -EBADE; + } + + return 0; +} +EXPORT_SYMBOL(vfs_clone_file_prep_inodes); + int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) { @@ -1718,6 +1826,102 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, } EXPORT_SYMBOL(vfs_clone_file_range); +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. 
+ * Caller must have locked both inodes to prevent write races. + */ +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} +EXPORT_SYMBOL(vfs_dedupe_file_range_compare); + int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) { struct file_dedupe_range_info *info; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index a279b4e7f5fe..95d6828967f0 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1164,111 +1164,6 @@ err: return error; } -/* - * Read a page's worth of file data into the page cache. Return the page - * locked. 
- */ -static struct page * -xfs_get_page( - struct inode *inode, - xfs_off_t offset) -{ - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; -} - -/* - * Compare extents of two files to see if they are the same. - */ -static int -xfs_compare_extents( - struct inode *src, - xfs_off_t srcoff, - struct inode *dest, - xfs_off_t destoff, - xfs_off_t len, - bool *is_same) -{ - xfs_off_t src_poff; - xfs_off_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - xfs_off_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - ASSERT(cmp_len > 0); - - trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, - XFS_I(dest), destoff); - - src_page = xfs_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = xfs_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - 
trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); - return error; -} - /* * Link a range of blocks from one file to another. */ @@ -1286,14 +1181,11 @@ xfs_reflink_remap_range( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); struct xfs_mount *mp = src->i_mount; - loff_t bs = inode_out->i_sb->s_blocksize; bool same_inode = (inode_in == inode_out); xfs_fileoff_t sfsbno, dfsbno; xfs_filblks_t fsblen; xfs_extlen_t cowextsize; - loff_t isize; ssize_t ret; - loff_t blen; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return -EOPNOTSUPP; @@ -1310,26 +1202,8 @@ xfs_reflink_remap_range( xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); } - /* Don't touch certain kinds of inodes */ - ret = -EPERM; - if (IS_IMMUTABLE(inode_out)) - goto out_unlock; - - ret = -ETXTBSY; - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - goto out_unlock; - - - /* Don't reflink dirs, pipes, sockets... */ - ret = -EISDIR; - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - goto out_unlock; + /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; - if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) - goto out_unlock; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - goto out_unlock; - /* Don't reflink realtime inodes */ if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) goto out_unlock; @@ -1338,91 +1212,18 @@ xfs_reflink_remap_range( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - /* Are we going all the way to the end? 
*/ - isize = i_size_read(inode_in); - if (isize == 0) { - ret = 0; - goto out_unlock; - } - - if (len == 0) - len = isize - pos_in; - - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + len < pos_in || pos_out + len < pos_out || - pos_in + len > isize) - goto out_unlock; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + len > disize) - goto out_unlock; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - goto out_unlock; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - goto out_unlock; - } - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + len - 1); - if (ret) - goto out_unlock; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + len - 1); - if (ret) + ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + &len, is_dedupe); + if (ret || len == 0) goto out_unlock; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - /* - * Check that the extents are the same. - */ - if (is_dedupe) { - bool is_same = false; - - ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out, - len, &is_same); - if (ret) - goto out_unlock; - if (!is_same) { - ret = -EBADE; - goto out_unlock; - } - } - + /* Set flags and remap blocks. 
*/ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) goto out_unlock; - /* - * Invalidate the page cache so that we can clear any CoW mappings - * in the destination file. - */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - dfsbno = XFS_B_TO_FSBT(mp, pos_out); sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); @@ -1431,6 +1232,10 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + /* * Carry the cowextsize hint from src to dest if we're sharing the * entire source file to the entire destination file, the source file diff --git a/include/linux/fs.h b/include/linux/fs.h index dc0478c07b2a..caea736fa09c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1778,8 +1778,14 @@ extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *, int); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); +extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, + struct inode *inode_out, loff_t pos_out, + u64 *len, bool is_dedupe); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); +extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); From c0cf3ef5e0f47e385920450b245d22bead93e7ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Sep 2016 21:42:32 -0400 Subject: [PATCH 26/40] nfs_write_end(): fix handling of short copies What matters when deciding if we should make a page uptodate is not how much we _wanted_ to copy, but how much we actually have copied. 
As it is, on architectures that do not zero tail on short copy we can leave uninitialized data in page marked uptodate. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/nfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9ea85ae23c32..a1de8ef63e56 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -374,7 +374,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, */ if (!PageUptodate(page)) { unsigned pglen = nfs_page_length(page); - unsigned end = offset + len; + unsigned end = offset + copied; if (pglen == 0) { zero_user_segments(page, 0, offset, From b9de313cf05fe08fa59efaf19756ec5283af672a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Sep 2016 22:20:03 -0400 Subject: [PATCH 27/40] fix ceph_write_end() don't zero on short copies; if the page was uptodate it's just plain wrong, and if it wasn't we'll be better off just returning 0 and buggering off. Signed-off-by: Al Viro --- fs/ceph/addr.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ef3ebd780aff..834be0943a26 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1276,25 +1276,27 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - unsigned from = pos & (PAGE_SIZE - 1); int check_cap = 0; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); /* zero the stale part of the page if we did a short copy */ - if (copied < len) - zero_user_segment(page, from+copied, len); + if (!PageUptodate(page)) { + if (copied < len) { + copied = 0; + goto out; + } + SetPageUptodate(page); + } /* did file size increase? 
*/ if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); - if (!PageUptodate(page)) - SetPageUptodate(page); - set_page_dirty(page); +out: unlock_page(page); put_page(page); From 43388b21e72d36204822bcc3119e42abe6ebceef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Sep 2016 22:06:35 -0400 Subject: [PATCH 28/40] fix gfs2_stuffed_write_end() on short copies a) the page is uptodate - ->write_begin() would either fail (in which case we don't reach ->write_end()), or unstuff the inode, or find the page already uptodate, or do a successful call of stuffed_readpage(), which would've made it uptodate b) zeroing the tail in pagecache is wrong. kill -9 at the right time while writing unmodified file contents to the same file should _not_ leave us in a situation when read() from the file will be reporting it full of zeroes. Especially since that effect will be transient - at some later point the page will be evicted and then we'll be back to the real file contents. 
Signed-off-by: Al Viro --- fs/gfs2/aops.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5a6f52ea2722..6b039d7ce160 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -839,12 +839,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); kaddr = kmap_atomic(page); memcpy(buf + pos, kaddr + pos, copied); - memset(kaddr + pos + copied, 0, len - copied); flush_dcache_page(page); kunmap_atomic(kaddr); - if (!PageUptodate(page)) - SetPageUptodate(page); + WARN_ON(!PageUptodate(page)); unlock_page(page); put_page(page); From 77469c3f570e329acb631c5c03780eacdca2a534 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 29 Aug 2016 20:56:35 -0400 Subject: [PATCH 29/40] 9p: saner ->write_end() on failing copy into non-uptodate page If we had a short copy into an uptodate page, there's no reason whatsoever to zero anything; OTOH, if that page had _not_ been uptodate, we must have been trying to overwrite it completely and got a short copy. In that case, overwriting the end with zeroes, marking uptodate and sending to server is just plain wrong. Just unlock, keep it non-uptodate and return 0. 
Signed-off-by: Al Viro --- fs/9p/vfs_addr.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 6181ad79e1a5..ff8ece89fb99 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -309,18 +309,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - if (unlikely(copied < len)) { - /* - * zero out the rest of the area - */ - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user(page, from + copied, len - copied); - flush_dcache_page(page); + if (unlikely(copied < len && !PageUptodate(page))) { + copied = 0; + goto out; } - - if (!PageUptodate(page)) - SetPageUptodate(page); /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. @@ -330,6 +322,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, i_size_write(inode, last_pos); } set_page_dirty(page); +out: unlock_page(page); put_page(page); From 92e50d2d42d7cb9fcd816b47b580622032d38293 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 29 Aug 2016 22:04:51 -0400 Subject: [PATCH 30/40] exofs: don't mess with simple_write_{begin,end} ... and don't zero anything on short copy; just unlock and return 0 if that has happened on non-uptodate page. 
Signed-off-by: Al Viro --- fs/exofs/inode.c | 68 +++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d8072bc074a4..0ac62811b341 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -870,46 +870,31 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, page = *pagep; if (page == NULL) { - ret = simple_write_begin(file, mapping, pos, len, flags, pagep, - fsdata); - if (ret) { - EXOFS_DBGMSG("simple_write_begin failed\n"); - goto out; + page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT, + flags); + if (!page) { + EXOFS_DBGMSG("grab_cache_page_write_begin failed\n"); + return -ENOMEM; } - - page = *pagep; + *pagep = page; } /* read modify write */ if (!PageUptodate(page) && (len != PAGE_SIZE)) { loff_t i_size = i_size_read(mapping->host); pgoff_t end_index = i_size >> PAGE_SHIFT; - size_t rlen; - if (page->index < end_index) - rlen = PAGE_SIZE; - else if (page->index == end_index) - rlen = i_size & ~PAGE_MASK; - else - rlen = 0; - - if (!rlen) { + if (page->index > end_index) { clear_highpage(page); SetPageUptodate(page); - goto out; - } - - ret = _readpage(page, true); - if (ret) { - /*SetPageError was done by _readpage. 
Is it ok?*/ - unlock_page(page); - EXOFS_DBGMSG("__readpage failed\n"); + } else { + ret = _readpage(page, true); + if (ret) { + unlock_page(page); + EXOFS_DBGMSG("__readpage failed\n"); + } } } -out: - if (unlikely(ret)) - _write_failed(mapping->host, pos + len); - return ret; } @@ -929,18 +914,25 @@ static int exofs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - /* According to comment in simple_write_end i_mutex is held */ - loff_t i_size = inode->i_size; - int ret; + loff_t last_pos = pos + copied; - ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); - if (unlikely(ret)) - _write_failed(inode, pos + len); - - /* TODO: once simple_write_end marks inode dirty remove */ - if (i_size != inode->i_size) + if (!PageUptodate(page)) { + if (copied < len) { + _write_failed(inode, pos + len); + copied = 0; + goto out; + } + SetPageUptodate(page); + } + if (last_pos > inode->i_size) { + i_size_write(inode, last_pos); mark_inode_dirty(inode); - return ret; + } + set_page_dirty(page); +out: + unlock_page(page); + put_page(page); + return copied; } static int exofs_releasepage(struct page *page, gfp_t gfp) From 04fff6416cb7876091f0b2f413caf43e3618d5ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 29 Aug 2016 22:39:56 -0400 Subject: [PATCH 31/40] simple_write_end(): don't zero in short copy into uptodate Signed-off-by: Al Viro --- fs/libfs.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 48826d4da189..76048705d922 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -465,6 +465,8 @@ EXPORT_SYMBOL(simple_write_begin); * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. 
+ * + * Use *ONLY* with simple_readpage() */ int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -474,14 +476,14 @@ int simple_write_end(struct file *file, struct address_space *mapping, loff_t last_pos = pos + copied; /* zero the stale part of the page if we did a short copy */ - if (copied < len) { - unsigned from = pos & (PAGE_SIZE - 1); + if (!PageUptodate(page)) { + if (copied < len) { + unsigned from = pos & (PAGE_SIZE - 1); - zero_user(page, from + copied, len - copied); - } - - if (!PageUptodate(page)) + zero_user(page, from + copied, len - copied); + } SetPageUptodate(page); + } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. From 84e40080bd6f363ddbcab75b04cb7bc742efbf12 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Nov 2016 14:13:09 -0800 Subject: [PATCH 32/40] ocfs2: convert inode refcount test to a helper Replace the open-coded inode refcount flag test with a helper function to reduce the potential for bugs. Signed-off-by: Darrick J. 
Wong --- fs/ocfs2/alloc.c | 3 +-- fs/ocfs2/file.c | 7 +++---- fs/ocfs2/inode.h | 6 ++++++ fs/ocfs2/move_extents.c | 10 ++-------- fs/ocfs2/refcounttree.c | 22 +++++++++------------- fs/ocfs2/xattr.c | 4 ++-- 6 files changed, 23 insertions(+), 29 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f72712f6c28d..a0ca49f09880 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5713,8 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_refcount_tree *ref_tree = NULL; if ((flags & OCFS2_EXT_REFCOUNTED) && len) { - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); if (!refcount_tree_locked) { ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 000c234d7bbd..d261f3a91870 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1030,7 +1030,7 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, * Only quota files call this without a bh, and they can't be * refcounted. */ - BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode)); BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); @@ -1719,8 +1719,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, * within one cluster(means is not exactly aligned to clustersize). 
*/ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { - + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); if (ret) { mlog_errno(ret); @@ -2036,7 +2035,7 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, struct super_block *sb = inode->i_sb; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || - !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + !ocfs2_is_refcount_inode(inode) || OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5af68fcdf9d3..9b955f732bca 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -181,4 +181,10 @@ static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_ return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); } +/* Does this inode have the reflink flag set? */ +static inline bool ocfs2_is_refcount_inode(struct inode *inode) +{ + return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 4e8f32eb0bdb..e52a2852d50d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -235,10 +235,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, @@ -581,10 +578,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); 
BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19238512a324..3410eb105b0d 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -410,7 +410,7 @@ static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) goto out; } - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); di = (struct ocfs2_dinode *)di_bh->b_data; *ref_blkno = le64_to_cpu(di->i_refcount_loc); @@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, u32 num_got; u64 suballoc_loc, first_blkno; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -708,7 +708,7 @@ static int ocfs2_set_refcount_tree(struct inode *inode, struct ocfs2_refcount_block *rb; struct ocfs2_refcount_tree *ref_tree; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, &ref_tree, &ref_root_bh); @@ -775,7 +775,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); u16 bit = 0; - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + if (!ocfs2_is_refcount_inode(inode)) return 0; BUG_ON(!ref_blkno); @@ -2299,11 +2299,10 @@ int ocfs2_decrease_refcount(struct inode *inode, { int ret; u64 ref_blkno; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_block(inode, &ref_blkno); if (ret) { @@ -2533,7 +2532,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *ref_blocks) { int ret; - 
struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); @@ -2544,7 +2542,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, goto out; } - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), refcount_loc, &tree); @@ -3412,14 +3410,13 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, { int ret; u32 cow_start = 0, cow_len = 0; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_cow_context *context = NULL; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, cpos, write_len, max_cpos, @@ -3629,11 +3626,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, { int ret; struct ocfs2_xattr_value_root *xv = vb->vb_xv; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_cow_context *context = NULL; u32 cow_start, cow_len; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, cpos, write_len, UINT_MAX, @@ -3807,7 +3803,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, ocfs2_init_dealloc_ctxt(&dealloc); - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + if (!ocfs2_is_refcount_inode(inode)) { ret = ocfs2_create_refcount_tree(inode, di_bh); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index cb157a34a656..3c5384d9b3a5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2577,7 +2577,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head 
*di_bh) if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return 0; - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); @@ -3608,7 +3608,7 @@ int ocfs2_xattr_set(struct inode *inode, } /* Check whether the value is refcounted and do some preparation. */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + if (ocfs2_is_refcount_inode(inode) && (!xis.not_found || !xbs.not_found)) { ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, &xis, &xbs, &ref_tree, From 86544fbd853c49a9eccb3d0f4e7eb9317f3fccf9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Nov 2016 14:13:09 -0800 Subject: [PATCH 33/40] ocfs2: add newlines to some error messages These two error messages are missing the trailing newline. Signed-off-by: Darrick J. Wong --- fs/ocfs2/alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a0ca49f09880..d4ec0d8961a6 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5194,7 +5194,7 @@ int ocfs2_change_extent_flag(handle_t *handle, rec = &el->l_recs[index]; if (new_flags && (rec->e_flags & new_flags)) { mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " - "extent that already had them", + "extent that already had them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), new_flags); goto out; @@ -5202,7 +5202,7 @@ int ocfs2_change_extent_flag(handle_t *handle, if (clear_flags && !(rec->e_flags & clear_flags)) { mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " - "extent that didn't have them", + "extent that didn't have them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), clear_flags); goto out; From 06a70305812c3973c66824f26223656283c59b27 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Wed, 9 Nov 2016 14:13:10 -0800 Subject: [PATCH 34/40] ocfs2: prohibit refcounted swapfiles The swapfile mechanism calls bmap once to find all the swap file mappings, which means that we cannot properly support CoW remapping. Therefore, error out if the swap code tries to call bmap on a refcounted file. Signed-off-by: Darrick J. Wong --- fs/ocfs2/aops.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..4d037db84be5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -464,6 +464,15 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)block); + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasses the file system for actual I/O. We really can't allow + * that on refcounted inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (ocfs2_is_refcount_inode(inode)) + return 0; + /* We don't need to lock journal system files, since they aren't * accessed concurrently from multiple nodes. */ From 3e10b793fc40dfdbe51762e0d084bd6f2c8acaaa Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Nov 2016 14:13:11 -0800 Subject: [PATCH 35/40] ocfs2: budget for extent tree splits when adding refcount flag When we're adding the refcount flag to an extent, we have to budget enough space to handle a full extent btree split in addition to whatever modifications have to be made to the refcount btree. We don't currently do this, with the result that generic/186 crashes when we need an extent split but not a refcount split because meta_ac never gets allocated. Signed-off-by: Darrick J.
Wong --- fs/ocfs2/refcounttree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3410eb105b0d..6c98d567ba01 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3692,6 +3692,9 @@ int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_alloc_context *meta_ac = NULL; + /* We need to be able to handle at least an extent tree split. */ + ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el); + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, ref_ci, ref_root_bh, p_cluster, num_clusters, From 085549553dca86c866f26d233d9cfe19f169c288 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Nov 2016 14:42:49 -0800 Subject: [PATCH 36/40] ocfs2: don't eat io errors during _dio_end_io_write ocfs2_dio_end_io_write eats whatever errors may happen, which means that write errors do not propagate to userspace. Fix that. Signed-off-by: Darrick J. Wong --- fs/ocfs2/aops.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4d037db84be5..136a49cabc12 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2263,10 +2263,10 @@ out: return ret; } -static void ocfs2_dio_end_io_write(struct inode *inode, - struct ocfs2_dio_write_ctxt *dwc, - loff_t offset, - ssize_t bytes) +static int ocfs2_dio_end_io_write(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc, + loff_t offset, + ssize_t bytes) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree et; @@ -2374,6 +2374,8 @@ out: if (locked) inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); + + return ret; } /* @@ -2388,6 +2390,7 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; + int ret = 0; if (bytes <= 0) return 0; @@ -2396,13 +2399,13 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); if 
(private) - ocfs2_dio_end_io_write(inode, private, offset, bytes); + ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); - return 0; + return ret; } static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) From dbf896fc286a62bf215b904c6ff5d197df93e685 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 1 Dec 2016 16:31:14 -0800 Subject: [PATCH 37/40] ocfs2: always unlock when completing dio writes Always unlock the inode when completing dio writes, even if an error has occurred. The caller already checks the inode and unlocks it if needed, so we might as well reduce contention. Signed-off-by: Darrick J. Wong --- fs/ocfs2/aops.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 136a49cabc12..3c531f108a21 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2392,13 +2392,10 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, int level; int ret = 0; - if (bytes <= 0) - return 0; - /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (private) + if (bytes > 0 && private) ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); ocfs2_iocb_clear_rw_locked(iocb); From aef73a61c01a4dca3f26c22df05039f78fe9d468 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 9 Dec 2016 16:10:15 -0800 Subject: [PATCH 38/40] ocfs2: fix bad pointer cast generic/188 triggered a dmesg stack trace because the dio completion was casting a buffer head to an on-disk inode, which is whacky. Signed-off-by: Darrick J.
Wong --- fs/ocfs2/aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3c531f108a21..3372d82d12b6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2317,7 +2317,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode, mlog_errno(ret); } - di = (struct ocfs2_dinode *)di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); From 86e59436d406d833a5da4a94aefb3c3be6b26053 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 22 Nov 2016 13:40:27 -0800 Subject: [PATCH 39/40] ocfs2: charge quota for reflinked blocks When ocfs2 shares blocks from one file to another, it's necessary to charge that many blocks to the quota because ocfs2 tallies block charges according to the number of blocks mapped, not the number of physical blocks used. Without this patch, reflinking X blocks and then CoWing all of them causes quota usage to *decrease* by X as seen in generic/305. Signed-off-by: Darrick J. Wong --- fs/ocfs2/refcounttree.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 6c98d567ba01..dc8089af9ddf 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3933,6 +3933,13 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, p_cluster, num_clusters, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, num_clusters)); if (ret) mlog_errno(ret); From 29ac8e856cb3694e004037de595dec4ec53d42f2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Nov 2016 14:13:11 -0800 Subject: [PATCH 40/40] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features Connect the new VFS clone_range, copy_range, and dedupe_range features to the existing reflink capability of ocfs2. 
Compared to the existing ocfs2 reflink ioctl, we have to do things a little differently to support the VFS semantics (we can clone subranges of a file but we don't clone xattrs), but the VFS ioctls are more broadly supported. Signed-off-by: Darrick J. Wong --- v2: Convert inline data files to extents files before reflinking, and fix i_blocks so that stat(2) output is correct. v3: Make zero-length dedupe consistent with btrfs behavior. v4: Use VFS double-inode lock routines and remove MAX_DEDUPE_LEN. --- fs/ocfs2/file.c | 35 +++- fs/ocfs2/file.h | 3 + fs/ocfs2/refcounttree.c | 432 ++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 7 + 4 files changed, 474 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d261f3a91870..c4889655d32b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode, *done = ret; } -static int ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, u64 byte_start, - u64 byte_len) +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) { int ret = 0, flags = 0, done = 0, i; u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; @@ -2439,6 +2439,31 @@ out: return offset; } +static int ocfs2_file_clone_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +static ssize_t ocfs2_file_dedupe_range(struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2478,6 +2503,8 @@ const struct file_operations ocfs2_fops = { .splice_read =
generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops = { @@ -2523,6 +2550,8 @@ const struct file_operations ocfs2_fops_no_plocks = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e8c62f22215c..897fd9a2e51d 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, size_t count); +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len); #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index dc8089af9ddf..b18465e330b1 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -34,6 +34,7 @@ #include "xattr.h" #include "namei.h" #include "ocfs2_trace.h" +#include "file.h" #include #include @@ -4448,3 +4449,434 @@ out: return error; } + +/* Update destination inode size, if necessary. */ +static int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) +{ + handle_t *handle; + int ret; + + dest->i_blocks = ocfs2_inode_sector_count(dest); + + if (newlen <= i_size_read(dest)) + return 0; + + handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + /* Extend i_size if needed. 
*/ + spin_lock(&OCFS2_I(dest)->ip_lock); + if (newlen > i_size_read(dest)) + i_size_write(dest, newlen); + spin_unlock(&OCFS2_I(dest)->ip_lock); + dest->i_ctime = dest->i_mtime = current_time(dest); + + ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle); + return ret; +} + +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ +static int ocfs2_reflink_remap_extent(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_extent_tree s_et; + struct ocfs2_extent_tree t_et; + struct ocfs2_dinode *dis; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_super *osb; + loff_t pstart, plen; + u32 p_cluster, num_clusters, slast, spos, tpos; + unsigned int ext_flags; + int ret = 0; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh); + ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh); + + spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in); + tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out); + slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len); + + while (spos < slast) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + /* Look up the extent. */ + ret = ocfs2_get_clusters(s_inode, spos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_clusters = min_t(u32, num_clusters, slast - spos); + + /* Punch out the dest range. 
*/ + pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos); + plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters); + ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (p_cluster == 0) + goto next_loop; + + /* Lock the refcount btree... */ + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(dis->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Mark s_inode's extent as refcounted. */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(s_inode, &s_et, + &ref_tree->rf_ci, + ref_root_bh, spos, + p_cluster, num_clusters, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + } + + /* Map in the new extent. */ + ext_flags |= OCFS2_EXT_REFCOUNTED; + ret = ocfs2_add_refcounted_extent(t_inode, &t_et, + &ref_tree->rf_ci, + ref_root_bh, + tpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +next_loop: + spos += num_clusters; + tpos += num_clusters; + } + +out: + return ret; +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + return ret; +} + +/* Set up refcount tree and remap s_inode to t_inode. */ +static int ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb; + struct ocfs2_dinode *dis; + struct ocfs2_dinode *dit; + int ret; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + dit = (struct ocfs2_dinode *)t_bh->b_data; + ocfs2_init_dealloc_ctxt(&dealloc); + + /* + * If we're reflinking the entire file and the source is inline + * data, just copy the contents. 
+ */ + if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) && + i_size_read(t_inode) <= len && + (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * If both inodes belong to two different refcount groups then + * forget it because we don't know how (or want) to go merging + * refcount trees. + */ + ret = -EOPNOTSUPP; + if (ocfs2_is_refcount_inode(s_inode) && + ocfs2_is_refcount_inode(t_inode) && + le64_to_cpu(dis->i_refcount_loc) != + le64_to_cpu(dit->i_refcount_loc)) + goto out; + + /* Neither inode has a refcount tree. Add one to s_inode. */ + if (!ocfs2_is_refcount_inode(s_inode) && + !ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_create_refcount_tree(s_inode, s_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Ensure that both inodes end up with the same refcount tree. */ + if (!ocfs2_is_refcount_inode(s_inode)) { + ret = ocfs2_set_refcount_tree(s_inode, s_bh, + le64_to_cpu(dit->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + if (!ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(dis->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Turn off inline data in the dest file. */ + if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Actually remap extents now. */ + ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, + pos_out, len, &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +/* Lock an inode and grab a bh pointing to the inode. 
*/ +static int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2) +{ + struct inode *inode1; + struct inode *inode2; + struct ocfs2_inode_info *oi1; + struct ocfs2_inode_info *oi2; + bool same_inode = (s_inode == t_inode); + int status; + + /* First grab the VFS and rw locks. */ + lock_two_nondirectories(s_inode, t_inode); + inode1 = s_inode; + inode2 = t_inode; + if (inode1->i_ino > inode2->i_ino) + swap(inode1, inode2); + + status = ocfs2_rw_lock(inode1, 1); + if (status) { + mlog_errno(status); + goto out_i1; + } + if (!same_inode) { + status = ocfs2_rw_lock(inode2, 1); + if (status) { + mlog_errno(status); + goto out_i2; + } + } + + /* Now go for the cluster locks */ + oi1 = OCFS2_I(inode1); + oi2 = OCFS2_I(inode2); + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* We always want to lock the one with the lower lockid first. */ + if (oi1->ip_blkno > oi2->ip_blkno) + mlog_errno(-ENOLCK); + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_rw2; + } + + /* lock id2 */ + if (!same_inode) { + status = ocfs2_inode_lock_nested(inode2, bh2, 1, + OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_cl1; + } + } else + *bh2 = *bh1; + + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + + return 0; + +out_cl1: + ocfs2_inode_unlock(inode1, 1); + brelse(*bh1); + *bh1 = NULL; +out_rw2: + ocfs2_rw_unlock(inode2, 1); +out_i2: + ocfs2_rw_unlock(inode1, 1); +out_i1: + unlock_two_nondirectories(s_inode, t_inode); + return status; +} + +/* Unlock both inodes and release buffers. 
*/ +static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + ocfs2_inode_unlock(s_inode, 1); + ocfs2_rw_unlock(s_inode, 1); + brelse(s_bh); + if (s_inode != t_inode) { + ocfs2_inode_unlock(t_inode, 1); + ocfs2_rw_unlock(t_inode, 1); + brelse(t_bh); + } + unlock_two_nondirectories(s_inode, t_inode); +} + +/* Link a range of blocks from one file to another. */ +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + bool same_inode = (inode_in == inode_out); + ssize_t ret; + + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + /* Check file eligibility and prepare for block sharing. */ + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + &len, is_dedupe); + if (ret || len == 0) + goto out_unlock; + + /* Lock out changes to the allocation maps and remap. */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, + out_bh, pos_out, len); + + /* Zap any page cache for the destination file's range. 
*/ + if (!ret) + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return 0; + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 6422bbcdb525..4af55bf4b35b 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -115,4 +115,11 @@ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve); +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe); + #endif /* OCFS2_REFCOUNTTREE_H */