From 8f11538ebe984e5434eeda4c7183d165cddb5936 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Jan 2020 10:14:09 -0500 Subject: [PATCH 01/69] do_add_mount(): lift lock_mount/unlock_mount into callers preparation to finish_automount() fix (next commit) Signed-off-by: Al Viro --- fs/namespace.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 85b5f7bea82e..dcd015fafe01 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2697,45 +2697,32 @@ static int do_move_mount_old(struct path *path, const char *old_name) /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, + struct path *path, int mnt_flags) { - struct mountpoint *mp; - struct mount *parent; - int err; + struct mount *parent = real_mount(path->mnt); mnt_flags &= ~MNT_INTERNAL_FLAGS; - mp = lock_mount(path); - if (IS_ERR(mp)) - return PTR_ERR(mp); - - parent = real_mount(path->mnt); - err = -EINVAL; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) - goto unlock; + return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) - goto unlock; + return -EINVAL; } /* Refuse the same filesystem on the same mount point */ - err = -EBUSY; if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) - goto unlock; + return -EBUSY; - err = -EINVAL; if (d_is_symlink(newmnt->mnt.mnt_root)) - goto unlock; + return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - err = graft_tree(newmnt, parent, mp); - -unlock: - unlock_mount(mp); - return err; + return graft_tree(newmnt, parent, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); @@ -2748,6 +2735,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; + struct mountpoint *mp; struct super_block *sb = fc->root->d_sb; int error; @@ -2768,7 +2756,13 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + mp = lock_mount(mountpoint); + if (IS_ERR(mp)) { + mntput(mnt); + return PTR_ERR(mp); + } + error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); + unlock_mount(mp); if (error < 0) mntput(mnt); return error; @@ -2830,6 +2824,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, struct path *path) { struct mount *mnt = real_mount(m); + struct mountpoint *mp; int err; /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it @@ -2842,7 +2837,13 @@ int finish_automount(struct vfsmount *m, struct path *path) goto fail; } - err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + mp = lock_mount(path); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + goto fail; + } + err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(mp); if (!err) return 0; fail: From 26df6034fdb211857e069e7b07d25a368da646df Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Jan 2020 10:44:29 -0500 Subject: [PATCH 02/69] fix automount/automount race properly Protection against automount/automount races (two threads hitting the same referral point at the same time) is based upon do_add_mount() prevention of identical overmounts - trying to overmount the root of mounted tree with the same tree fails with -EBUSY. It's unreliable (the other thread might've mounted something on top of the automount it has triggered) *and* causes no end of headache for follow_automount() and its caller, since finish_automount() behaves like do_new_mount() - if the mountpoint to be is overmounted, it mounts on top what's overmounting it. It's not only wrong (we want to go into what's overmounting the automount point and quietly discard what we planned to mount there), it introduces the possibility of original parent mount getting dropped. That's what 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount) deals with, but it can't do anything about the reliability of conflict detection - if something had been overmounted the other thread's automount (e.g. that other thread having stepped into automount in mount(2)), we don't get that -EBUSY and the result is referral point under automounted NFS under explicit overmount under another copy of automounted NFS What we need is finish_automount() *NOT* digging into overmounts - if it finds one, it should just quietly discard the thing it was asked to mount. And don't bother with actually crossing into the results of finish_automount() - the same loop that calls follow_automount() will do that just fine on the next iteration. IOW, instead of calling lock_mount() have finish_automount() do it manually, _without_ the "move into overmount and retry" part. And leave crossing into the results to the caller of follow_automount(), which simplifies it a lot. Moral: if you end up with a lot of glue working around the calling conventions of something, perhaps these calling conventions are simply wrong... Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount) Signed-off-by: Al Viro --- fs/namei.c | 29 ++++------------------------- fs/namespace.c | 40 +++++++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index db6565c99825..626eddb33508 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1208,11 +1208,9 @@ EXPORT_SYMBOL(follow_up); * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ -static int follow_automount(struct path *path, struct nameidata *nd, - bool *need_mntput) +static int follow_automount(struct path *path, struct nameidata *nd) { struct vfsmount *mnt; - int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) return -EREMOTE; @@ -1253,29 +1251,10 @@ static int follow_automount(struct path *path, struct nameidata *nd, return PTR_ERR(mnt); } - if (!mnt) /* mount collision */ + if (!mnt) return 0; - if (!*need_mntput) { - /* lock_mount() may release path->mnt on error */ - mntget(path->mnt); - *need_mntput = true; - } - err = finish_automount(mnt, path); - - switch (err) { - case -EBUSY: - /* Someone else made a mount here whilst we were busy */ - return 0; - case 0: - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - return 0; - default: - return err; - } - + return finish_automount(mnt, path); } /* @@ -1333,7 +1312,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) /* Handle an automount point */ if (flags & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, nd, &need_mntput); + ret = follow_automount(path, nd); if (ret < 0) break; continue; diff --git a/fs/namespace.c b/fs/namespace.c index dcd015fafe01..777c3116e62e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2823,6 +2823,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, struct path *path) { + struct dentry *dentry = path->dentry; struct mount *mnt = real_mount(m); struct mountpoint *mp; int err; @@ -2832,21 +2833,46 @@ int finish_automount(struct vfsmount *m, struct path *path) BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && - m->mnt_root == path->dentry) { + m->mnt_root == dentry) { err = -ELOOP; - goto fail; + goto discard; } - mp = lock_mount(path); + /* + * we don't want to use lock_mount() - in this case finding something + * that overmounts our mountpoint to be means "quitely drop what we've + * got", not "try to mount it on top". + */ + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) { + err = -ENOENT; + goto discard_locked; + } + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, dentry))) { + rcu_read_unlock(); + err = 0; + goto discard_locked; + } + rcu_read_unlock(); + mp = get_mountpoint(dentry); if (IS_ERR(mp)) { err = PTR_ERR(mp); - goto fail; + goto discard_locked; } + err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); unlock_mount(mp); - if (!err) - return 0; -fail: + if (unlikely(err)) + goto discard; + mntput(m); + return 0; + +discard_locked: + namespace_unlock(); + inode_unlock(dentry->d_inode); +discard: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { namespace_lock(); From 25e195aa1e607f129ab912d29fcfc79239703307 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Jan 2020 11:27:46 -0500 Subject: [PATCH 03/69] follow_automount(): get rid of dead^Wstillborn code 1) no instances of ->d_automount() have ever made use of the "return ERR_PTR(-EISDIR) if you don't feel like mounting anything" - that's a rudiment of plans that got superseded before the thing went into the tree. Despite the comment in follow_automount(), autofs has never done that. 2) if there's no ->d_automount() in dentry_operations, filesystems should not set DCACHE_NEED_AUTOMOUNT in the first place. None have ever done so... Signed-off-by: Al Viro --- fs/namei.c | 28 +++------------------------- fs/namespace.c | 9 ++++++++- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 626eddb33508..39dd56f5171f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1210,10 +1210,7 @@ EXPORT_SYMBOL(follow_up); */ static int follow_automount(struct path *path, struct nameidata *nd) { - struct vfsmount *mnt; - - if (!path->dentry->d_op || !path->dentry->d_op->d_automount) - return -EREMOTE; + struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to @@ -1228,33 +1225,14 @@ static int follow_automount(struct path *path, struct nameidata *nd) */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) + dentry->d_inode) return -EISDIR; nd->total_link_count++; if (nd->total_link_count >= 40) return -ELOOP; - mnt = path->dentry->d_op->d_automount(path); - if (IS_ERR(mnt)) { - /* - * The filesystem is allowed to return -EISDIR here to indicate - * it doesn't want to automount. For instance, autofs would do - * this so that its userspace daemon can mount on this dentry. - * - * However, we can only permit this if it's a terminal point in - * the path being looked up; if it wasn't then the remainder of - * the path is inaccessible and we should say so. - */ - if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT)) - return -EREMOTE; - return PTR_ERR(mnt); - } - - if (!mnt) - return 0; - - return finish_automount(mnt, path); + return finish_automount(dentry->d_op->d_automount(path), path); } /* diff --git a/fs/namespace.c b/fs/namespace.c index 777c3116e62e..743980380a8f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2824,9 +2824,16 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, struct path *path) { struct dentry *dentry = path->dentry; - struct mount *mnt = real_mount(m); struct mountpoint *mp; + struct mount *mnt; int err; + + if (!m) + return 0; + if (IS_ERR(m)) + return PTR_ERR(m); + + mnt = real_mount(m); /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it */ From 1c9f5e06a613cc48608db1a4207b8bd5ebe70a81 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Jan 2020 22:05:18 -0500 Subject: [PATCH 04/69] follow_automount() doesn't need the entire nameidata Only the address of ->total_link_count and the flags. And fix an off-by-one is ELOOP detection - make it consistent with symlink following, where we check if the pre-increment value has reached 40, rather than check the post-increment one. [kudos to Christian Brauner for spotted braino] Signed-off-by: Al Viro --- fs/namei.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 39dd56f5171f..6721c5f7e9d5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1208,7 +1208,7 @@ EXPORT_SYMBOL(follow_up); * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ -static int follow_automount(struct path *path, struct nameidata *nd) +static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; @@ -1223,13 +1223,12 @@ static int follow_automount(struct path *path, struct nameidata *nd) * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ - if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; - nd->total_link_count++; - if (nd->total_link_count >= 40) + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); @@ -1290,7 +1289,8 @@ static int follow_managed(struct path *path, struct nameidata *nd) /* Handle an automount point */ if (flags & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, nd); + ret = follow_automount(path, &nd->total_link_count, + nd->flags); if (ret < 0) break; continue; From 31d1726d7250021c66c9f16d8a128444676db782 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Jan 2020 20:19:38 -0500 Subject: [PATCH 05/69] make build_open_flags() treat O_CREAT | O_EXCL as implying O_NOFOLLOW O_CREAT | O_EXCL means "-EEXIST if we run into a trailing symlink". As it is, we might or might not have LOOKUP_FOLLOW in op->intent in that case - that depends upon having O_NOFOLLOW in open flags. It doesn't matter, since we won't be checking it in that case - do_last() bails out earlier. However, making sure it's not set (i.e. acting as if we had an explicit O_NOFOLLOW) makes the behaviour more explicit and allows to reorder the check for O_CREAT | O_EXCL in do_last() with the call of step_into() immediately following it. Signed-off-by: Al Viro --- fs/namei.c | 15 +++++---------- fs/open.c | 4 +++- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 6721c5f7e9d5..6938d20aa73a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3396,22 +3396,17 @@ static int do_last(struct nameidata *nd, if (unlikely(error < 0)) return error; - /* - * create/update audit record if it already exists. - */ - audit_inode(nd->name, path.dentry, 0); - - if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { - path_to_nameidata(&path, nd); - return -EEXIST; - } - seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: error = step_into(nd, &path, 0, inode, seq); if (unlikely(error)) return error; + + if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { + audit_inode(nd->name, nd->path.dentry, 0); + return -EEXIST; + } finish_open: /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); diff --git a/fs/open.c b/fs/open.c index 0788b3715731..e5227cd533f4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1049,8 +1049,10 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; - if (flags & O_EXCL) + if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; + flags |= O_NOFOLLOW; + } } if (flags & O_DIRECTORY) From bd7c4b508344680c843e2d2436d56b9fc8aedc9d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Jan 2020 20:37:23 -0500 Subject: [PATCH 06/69] handle_mounts(): start building a sane wrapper for follow_managed() All callers of follow_managed() follow it on success with the same steps - d_backing_inode(path->dentry) is calculated and stored into some struct inode * variable and, in all but one case, an unsigned variable (nd->seq to be) is zeroed. The single exception is lookup_fast() and there zeroing is correct thing to do - not doing it is a pointless microoptimization. Add a wrapper for follow_managed() that would do that combination. It's mostly a vehicle for code massage - it will be changing quite a bit, and the current calling conventions are by no means final. Right now it takes path, nameidata and (as out params) inode and seq, similar to __follow_mount_rcu(). Which will soon get folded into it... Signed-off-by: Al Viro --- fs/namei.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 6938d20aa73a..c104ec75faef 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1385,6 +1385,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } +static inline int handle_mounts(struct path *path, struct nameidata *nd, + struct inode **inode, unsigned int *seqp) +{ + int ret = follow_managed(path, nd); + + if (likely(ret >= 0)) { + *inode = d_backing_inode(path->dentry); + *seqp = 0; /* out of RCU mode, so the value doesn't matter */ + } + return ret; +} + static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; @@ -1607,7 +1619,6 @@ static int lookup_fast(struct nameidata *nd, struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; int status = 1; - int err; /* * Rename seqlock is not required here because in the off chance @@ -1677,10 +1688,7 @@ static int lookup_fast(struct nameidata *nd, path->mnt = mnt; path->dentry = dentry; - err = follow_managed(path, nd); - if (likely(err > 0)) - *inode = d_backing_inode(path->dentry); - return err; + return handle_mounts(path, nd, inode, seqp); } /* Fast lookup failed, do it the slow way */ @@ -1875,12 +1883,9 @@ static int walk_component(struct nameidata *nd, int flags) return PTR_ERR(path.dentry); path.mnt = nd->path.mnt; - err = follow_managed(&path, nd); + err = handle_mounts(&path, nd, &inode, &seq); if (unlikely(err < 0)) return err; - - seq = 0; /* we are already out of RCU mode */ - inode = d_backing_inode(path.dentry); } return step_into(nd, &path, flags, inode, seq); @@ -2365,11 +2370,9 @@ static int handle_lookup_down(struct nameidata *nd) return -ECHILD; } else { dget(path.dentry); - err = follow_managed(&path, nd); + err = handle_mounts(&path, nd, &inode, &seq); if (unlikely(err < 0)) return err; - inode = d_backing_inode(path.dentry); - seq = 0; } path_to_nameidata(&path, nd); nd->inode = inode; @@ -3392,12 +3395,9 @@ static int do_last(struct nameidata *nd, got_write = false; } - error = follow_managed(&path, nd); + error = handle_mounts(&path, nd, &inode, &seq); if (unlikely(error < 0)) return error; - - seq = 0; /* out of RCU mode, so the value doesn't matter */ - inode = d_backing_inode(path.dentry); finish_lookup: error = step_into(nd, &path, 0, inode, seq); if (unlikely(error)) From 239eb983383b1b140e588fe8f1c7d1ce953285f6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Jan 2020 14:12:40 -0500 Subject: [PATCH 07/69] atomic_open(): saner calling conventions (return dentry on success) Currently it either returns -E... or puts (nd->path.mnt,dentry) into *path and returns 0. Make it return ERR_PTR(-E...) or dentry; adjust the caller. Fewer arguments and it's easier to keep track of *path contents that way. Signed-off-by: Al Viro --- fs/namei.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index c104ec75faef..5f8b791a6d6e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3087,10 +3087,10 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m * * Returns an error code otherwise. */ -static int atomic_open(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct file *file, - const struct open_flags *op, - int open_flag, umode_t mode) +static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, + struct file *file, + const struct open_flags *op, + int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; @@ -3131,17 +3131,15 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, } if (file->f_mode & FMODE_CREATED) fsnotify_create(dir, dentry); - if (unlikely(d_is_negative(dentry))) { + if (unlikely(d_is_negative(dentry))) error = -ENOENT; - } else { - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 0; - } } } - dput(dentry); - return error; + if (error) { + dput(dentry); + dentry = ERR_PTR(error); + } + return dentry; } /* @@ -3236,11 +3234,20 @@ static int lookup_open(struct nameidata *nd, struct path *path, } if (dir_inode->i_op->atomic_open) { - error = atomic_open(nd, dentry, path, file, op, open_flag, - mode); - if (unlikely(error == -ENOENT) && create_error) - error = create_error; - return error; + dentry = atomic_open(nd, dentry, file, op, open_flag, mode); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + if (unlikely(error == -ENOENT) && create_error) + error = create_error; + return error; + } + if (file->f_mode & FMODE_OPENED) { + dput(dentry); + return 0; + } + path->mnt = nd->path.mnt; + path->dentry = dentry; + return 0; } no_open: From da5ebf5aa676c4106d1e7032d22679831cb6ba72 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Jan 2020 14:25:14 -0500 Subject: [PATCH 08/69] lookup_open(): saner calling conventions (return dentry on success) same story as for atomic_open() in the previous commit. Signed-off-by: Al Viro --- fs/namei.c | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 5f8b791a6d6e..b350e1e2b46f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3157,10 +3157,9 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, * * An error code is returned on failure. */ -static int lookup_open(struct nameidata *nd, struct path *path, - struct file *file, - const struct open_flags *op, - bool got_write) +static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + const struct open_flags *op, + bool got_write) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; @@ -3171,7 +3170,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) - return -ENOENT; + return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); @@ -3179,7 +3178,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + return dentry; } if (d_in_lookup(dentry)) break; @@ -3195,7 +3194,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ - goto out_no_open; + return dentry; } /* @@ -3235,19 +3234,9 @@ static int lookup_open(struct nameidata *nd, struct path *path, if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, op, open_flag, mode); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - if (unlikely(error == -ENOENT) && create_error) - error = create_error; - return error; - } - if (file->f_mode & FMODE_OPENED) { - dput(dentry); - return 0; - } - path->mnt = nd->path.mnt; - path->dentry = dentry; - return 0; + if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) + dentry = ERR_PTR(create_error); + return dentry; } no_open: @@ -3283,14 +3272,11 @@ no_open: error = create_error; goto out_dput; } -out_no_open: - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 0; + return dentry; out_dput: dput(dentry); - return error; + return ERR_PTR(error); } /* @@ -3309,6 +3295,7 @@ static int do_last(struct nameidata *nd, unsigned seq; struct inode *inode; struct path path; + struct dentry *dentry; int error; nd->flags &= ~LOOKUP_PARENT; @@ -3365,14 +3352,18 @@ static int do_last(struct nameidata *nd, inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); - error = lookup_open(nd, &path, file, op, got_write); + dentry = lookup_open(nd, file, op, got_write); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); - if (error) + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); goto out; + } + path.mnt = nd->path.mnt; + path.dentry = dentry; if (file->f_mode & FMODE_OPENED) { if ((file->f_mode & FMODE_CREATED) || @@ -3380,6 +3371,7 @@ static int do_last(struct nameidata *nd, will_truncate = false; audit_inode(nd->name, file->f_path.dentry, 0); + dput(dentry); goto opened; } From e73cabff591706d7eb643d4c6981a236cb9d76ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Jan 2020 14:30:08 -0500 Subject: [PATCH 09/69] do_last(): collapse the call of path_to_nameidata() ... and shift filling struct path to just before the call of handle_mounts(). All callers of handle_mounts() are immediately preceded by path->mnt = nd->path.mnt now. Signed-off-by: Al Viro --- fs/namei.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index b350e1e2b46f..79029a1048be 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3362,8 +3362,6 @@ static int do_last(struct nameidata *nd, error = PTR_ERR(dentry); goto out; } - path.mnt = nd->path.mnt; - path.dentry = dentry; if (file->f_mode & FMODE_OPENED) { if ((file->f_mode & FMODE_CREATED) || @@ -3380,7 +3378,8 @@ static int do_last(struct nameidata *nd, open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = 0; - path_to_nameidata(&path, nd); + dput(nd->path.dentry); + nd->path.dentry = dentry; goto finish_open_created; } @@ -3394,6 +3393,8 @@ static int do_last(struct nameidata *nd, got_write = false; } + path.mnt = nd->path.mnt; + path.dentry = dentry; error = handle_mounts(&path, nd, &inode, &seq); if (unlikely(error < 0)) return error; From db3c9ade50b1f210c750886c23ba40dbf44b2bc8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Jan 2020 14:41:00 -0500 Subject: [PATCH 10/69] handle_mounts(): pass dentry in, turn path into a pure out argument All callers are equivalent to path->dentry = dentry; path->mnt = nd->path.mnt; err = handle_mounts(path, ...) Pass dentry as an explicit argument, fill *path in handle_mounts() itself. Signed-off-by: Al Viro --- fs/namei.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 79029a1048be..a613fa52a174 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1385,11 +1385,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } -static inline int handle_mounts(struct path *path, struct nameidata *nd, - struct inode **inode, unsigned int *seqp) +static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, + struct path *path, struct inode **inode, + unsigned int *seqp) { - int ret = follow_managed(path, nd); + int ret; + path->mnt = nd->path.mnt; + path->dentry = dentry; + ret = follow_managed(path, nd); if (likely(ret >= 0)) { *inode = d_backing_inode(path->dentry); *seqp = 0; /* out of RCU mode, so the value doesn't matter */ @@ -1685,10 +1689,7 @@ static int lookup_fast(struct nameidata *nd, dput(dentry); return status; } - - path->mnt = mnt; - path->dentry = dentry; - return handle_mounts(path, nd, inode, seqp); + return handle_mounts(nd, dentry, path, inode, seqp); } /* Fast lookup failed, do it the slow way */ @@ -1859,6 +1860,7 @@ static inline int step_into(struct nameidata *nd, struct path *path, static int walk_component(struct nameidata *nd, int flags) { struct path path; + struct dentry *dentry; struct inode *inode; unsigned seq; int err; @@ -1877,13 +1879,11 @@ static int walk_component(struct nameidata *nd, int flags) if (unlikely(err <= 0)) { if (err < 0) return err; - path.dentry = lookup_slow(&nd->last, nd->path.dentry, - nd->flags); - if (IS_ERR(path.dentry)) - return PTR_ERR(path.dentry); + dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - path.mnt = nd->path.mnt; - err = handle_mounts(&path, nd, &inode, &seq); + err = handle_mounts(nd, dentry, &path, &inode, &seq); if (unlikely(err < 0)) return err; } @@ -2355,7 +2355,7 @@ static inline int lookup_last(struct nameidata *nd) static int handle_lookup_down(struct nameidata *nd) { - struct path path = nd->path; + struct path path; struct inode *inode = nd->inode; unsigned seq = nd->seq; int err; @@ -2366,11 +2366,12 @@ static int handle_lookup_down(struct nameidata *nd) * at the very beginning of walk, so we lose nothing * if we simply redo everything in non-RCU mode */ + path = nd->path; if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) return -ECHILD; } else { - dget(path.dentry); - err = handle_mounts(&path, nd, &inode, &seq); + dget(nd->path.dentry); + err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq); if (unlikely(err < 0)) return err; } @@ -3393,9 +3394,7 @@ static int do_last(struct nameidata *nd, got_write = false; } - path.mnt = nd->path.mnt; - path.dentry = dentry; - error = handle_mounts(&path, nd, &inode, &seq); + error = handle_mounts(nd, dentry, &path, &inode, &seq); if (unlikely(error < 0)) return error; finish_lookup: From b023e1728bec6dbf0c6a7e2145e2cd025c89cd55 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jan 2020 22:04:43 -0500 Subject: [PATCH 11/69] lookup_fast(): consolidate the RCU success case 1) in case of __follow_mount_rcu() failure, lookup_fast() proceeds to call unlazy_child() and, should it succeed, handle_mounts(). Note that we have status > 0 (or we wouldn't be calling __follow_mount_rcu() at all), so all stuff conditional upon non-positive status won't be even touched. Consolidate just that sequence after the call of __follow_mount_rcu(). 2) calling d_is_negative() and keeping its result is pointless - we either don't get past checking ->d_seq (and don't use the results of d_is_negative() at all), or we are guaranteed that ->d_inode and type bits of ->d_flags had been consistent at the time of d_is_negative() call. IOW, we could only get to the use of its result if it's equal to !inode. The same ->d_seq check guarantees that after that point this CPU won't observe ->d_flags values older than ->d_inode update. So 'negative' variable is completely pointless these days. Signed-off-by: Al Viro --- fs/namei.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index a613fa52a174..c34c27025cf0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1631,7 +1631,6 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { if (unlazy_walk(nd)) @@ -1644,7 +1643,6 @@ static int lookup_fast(struct nameidata *nd, * the dentry name information from lookup. */ *inode = d_backing_inode(dentry); - negative = d_is_negative(dentry); if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) return -ECHILD; @@ -1665,12 +1663,15 @@ static int lookup_fast(struct nameidata *nd, * Note: do negative dentry check after revalidation in * case that drops it. */ - if (unlikely(negative)) + if (unlikely(!*inode)) return -ENOENT; path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) return 1; + if (unlazy_child(nd, dentry, seq)) + return -ECHILD; + return handle_mounts(nd, dentry, path, inode, seqp); } if (unlazy_child(nd, dentry, seq)) return -ECHILD; From c153007b7b7a65bb99c19eb1a72cebda5aa26a5e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Jan 2020 14:50:18 -0500 Subject: [PATCH 12/69] teach handle_mounts() to handle RCU mode ... and make the callers of __follow_mount_rcu() use handle_mounts(). Signed-off-by: Al Viro --- fs/namei.c | 46 +++++++++++++++++----------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index c34c27025cf0..37b2f82e6f2c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1393,6 +1393,18 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, path->mnt = nd->path.mnt; path->dentry = dentry; + if (nd->flags & LOOKUP_RCU) { + unsigned int seq = *seqp; + if (unlikely(!*inode)) + return -ENOENT; + if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + return 1; + if (unlazy_child(nd, dentry, seq)) + return -ECHILD; + // *path might've been clobbered by __follow_mount_rcu() + path->mnt = nd->path.mnt; + path->dentry = dentry; + } ret = follow_managed(path, nd); if (likely(ret >= 0)) { *inode = d_backing_inode(path->dentry); @@ -1620,7 +1632,6 @@ static int lookup_fast(struct nameidata *nd, struct path *path, struct inode **inode, unsigned *seqp) { - struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; int status = 1; @@ -1658,21 +1669,8 @@ static int lookup_fast(struct nameidata *nd, *seqp = seq; status = d_revalidate(dentry, nd->flags); - if (likely(status > 0)) { - /* - * Note: do negative dentry check after revalidation in - * case that drops it. - */ - if (unlikely(!*inode)) - return -ENOENT; - path->mnt = mnt; - path->dentry = dentry; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) - return 1; - if (unlazy_child(nd, dentry, seq)) - return -ECHILD; + if (likely(status > 0)) return handle_mounts(nd, dentry, path, inode, seqp); - } if (unlazy_child(nd, dentry, seq)) return -ECHILD; if (unlikely(status == -ECHILD)) @@ -2361,21 +2359,11 @@ static int handle_lookup_down(struct nameidata *nd) unsigned seq = nd->seq; int err; - if (nd->flags & LOOKUP_RCU) { - /* - * don't bother with unlazy_walk on failure - we are - * at the very beginning of walk, so we lose nothing - * if we simply redo everything in non-RCU mode - */ - path = nd->path; - if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) - return -ECHILD; - } else { + if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); - err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq); - if (unlikely(err < 0)) - return err; - } + err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq); + if (unlikely(err < 0)) + return err; path_to_nameidata(&path, nd); nd->inode = inode; nd->seq = seq; From 20e343571cefe31b3dba3e61c9b057781fbce40a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Jan 2020 14:58:31 -0500 Subject: [PATCH 13/69] lookup_fast(): take mount traversal into callers Current calling conventions: -E... on error, 0 on cache miss, result of handle_mounts(nd, dentry, path, inode, seqp) on success. Turn that into returning ERR_PTR(-E...), NULL and dentry resp.; deal with handle_mounts() in the callers. The thing is, they already do that in cache miss handling case, so we just need to supply dentry to them and unify the mount traversal in those cases. Fewer arguments that way, and we get closer to merging handle_mounts() and step_into(). Signed-off-by: Al Viro --- fs/namei.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 37b2f82e6f2c..84ce7ccd944e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1628,9 +1628,9 @@ static struct dentry *__lookup_hash(const struct qstr *name, return dentry; } -static int lookup_fast(struct nameidata *nd, - struct path *path, struct inode **inode, - unsigned *seqp) +static struct dentry *lookup_fast(struct nameidata *nd, + struct inode **inode, + unsigned *seqp) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; @@ -1645,8 +1645,8 @@ static int lookup_fast(struct nameidata *nd, dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { if (unlazy_walk(nd)) - return -ECHILD; - return 0; + return ERR_PTR(-ECHILD); + return NULL; } /* @@ -1655,7 +1655,7 @@ static int lookup_fast(struct nameidata *nd, */ *inode = d_backing_inode(dentry); if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); /* * This sequence count validates that the parent had no @@ -1665,30 +1665,30 @@ static int lookup_fast(struct nameidata *nd, * enough, we can use __read_seqcount_retry here. */ if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); *seqp = seq; status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) - return handle_mounts(nd, dentry, path, inode, seqp); + return dentry; if (unlazy_child(nd, dentry, seq)) - return -ECHILD; + return ERR_PTR(-ECHILD); if (unlikely(status == -ECHILD)) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) - return 0; + return NULL; status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); - return status; + return ERR_PTR(status); } - return handle_mounts(nd, dentry, path, inode, seqp); + return dentry; } /* Fast lookup failed, do it the slow way */ @@ -1874,19 +1874,18 @@ static int walk_component(struct nameidata *nd, int flags) put_link(nd); return err; } - err = lookup_fast(nd, &path, &inode, &seq); - if (unlikely(err <= 0)) { - if (err < 0) - return err; + dentry = lookup_fast(nd, &inode, &seq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); - - err = handle_mounts(nd, dentry, &path, &inode, &seq); - if (unlikely(err < 0)) - return err; } + err = handle_mounts(nd, dentry, &path, &inode, &seq); + if (unlikely(err < 0)) + return err; return step_into(nd, &path, flags, inode, seq); } @@ -3302,13 +3301,12 @@ static int do_last(struct nameidata *nd, if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, &path, &inode, &seq); - if (likely(error > 0)) + dentry = lookup_fast(nd, &inode, &seq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + if (likely(dentry)) goto finish_lookup; - if (error < 0) - return error; - BUG_ON(nd->inode != dir->d_inode); BUG_ON(nd->flags & LOOKUP_RCU); } else { @@ -3383,10 +3381,10 @@ static int do_last(struct nameidata *nd, got_write = false; } +finish_lookup: error = handle_mounts(nd, dentry, &path, &inode, &seq); if (unlikely(error < 0)) return error; -finish_lookup: error = step_into(nd, &path, 0, inode, seq); if (unlikely(error)) return error; From 56676ec390194930f7504cd84d494c221861bd7f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Mar 2020 21:54:54 -0400 Subject: [PATCH 14/69] step_into() callers: dismiss the symlink earlier We need to dismiss a symlink when we are done traversing it; currently that's done when we call step_into() for its last component. For the cases when we do not call step_into() for that component (i.e. when it's . or ..) we do the same symlink dismissal after the call of handle_dots(). What we need to guarantee is that the symlink won't be dismissed while we are still using nd->last.name - it's pointing into the body of said symlink. step_into() is sufficiently late - by the time it's called we'd already obtained the dentry, so the name we'd been looking up is no longer needed. However, it turns out to be cleaner to have that ("we are done with that component now, can dismiss the link") done explicitly - in the callers of step_into(). In handle_dots() case we won't be using the component string at all, so for . and .. the corresponding point is actually _before_ the call of handle_dots(), not after it. Fix a minor irregularity in do_last(), while we are at it - if trailing symlink ended with . or .. we forgot to dismiss it. Not a problem, since nameidata is about to be done with (neither . nor .. can be a trailing symlink, so this is the last iteration through the loop) and terminate_walk() will clean the stack anyway, but let's keep it more regular. Signed-off-by: Al Viro --- fs/namei.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 84ce7ccd944e..3097edcb4a1a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1838,8 +1838,6 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2}; static inline int step_into(struct nameidata *nd, struct path *path, int flags, struct inode *inode, unsigned seq) { - if (!(flags & WALK_MORE) && nd->depth) - put_link(nd); if (likely(!d_is_symlink(path->dentry)) || !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) { /* not a symlink or should not follow */ @@ -1869,9 +1867,9 @@ static int walk_component(struct nameidata *nd, int flags) * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { - err = handle_dots(nd, nd->last_type); if (!(flags & WALK_MORE) && nd->depth) put_link(nd); + err = handle_dots(nd, nd->last_type); return err; } dentry = lookup_fast(nd, &inode, &seq); @@ -1882,6 +1880,8 @@ static int walk_component(struct nameidata *nd, int flags) if (IS_ERR(dentry)) return PTR_ERR(dentry); } + if (!(flags & WALK_MORE) && nd->depth) + put_link(nd); err = handle_mounts(nd, dentry, &path, &inode, &seq); if (unlikely(err < 0)) @@ -3291,6 +3291,8 @@ static int do_last(struct nameidata *nd, nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { + if (nd->depth) + put_link(nd); error = handle_dots(nd, nd->last_type); if (unlikely(error)) return error; @@ -3382,6 +3384,8 @@ static int do_last(struct nameidata *nd, } finish_lookup: + if (nd->depth) + put_link(nd); error = handle_mounts(nd, dentry, &path, &inode, &seq); if (unlikely(error < 0)) return error; From aca2903eefd0f8a3ba672f985182f899b425ca24 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 9 Jan 2020 15:17:57 -0500 Subject: [PATCH 15/69] new step_into() flag: WALK_NOFOLLOW Tells step_into() not to follow symlinks, regardless of LOOKUP_FOLLOW. Allows to switch handle_lookup_down() to of step_into(), getting all follow_managed() and step_into() calls paired. Signed-off-by: Al Viro --- fs/namei.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 3097edcb4a1a..79f06be7f5d4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1827,7 +1827,7 @@ static int pick_link(struct nameidata *nd, struct path *link, return 1; } -enum {WALK_FOLLOW = 1, WALK_MORE = 2}; +enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; /* * Do we need to follow links? We _really_ want to be able @@ -1839,7 +1839,8 @@ static inline int step_into(struct nameidata *nd, struct path *path, int flags, struct inode *inode, unsigned seq) { if (likely(!d_is_symlink(path->dentry)) || - !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) { + !((flags & WALK_FOLLOW) || (nd->flags & LOOKUP_FOLLOW)) || + (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ path_to_nameidata(path, nd); nd->inode = inode; @@ -2363,10 +2364,7 @@ static int handle_lookup_down(struct nameidata *nd) err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq); if (unlikely(err < 0)) return err; - path_to_nameidata(&path, nd); - nd->inode = inode; - nd->seq = seq; - return 0; + return step_into(nd, &path, WALK_NOFOLLOW, inode, seq); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ From cbae4d12eeee6b002a1252c4c45213651e8f4b55 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2020 13:40:02 -0500 Subject: [PATCH 16/69] fold handle_mounts() into step_into() The following is true: * calls of handle_mounts() and step_into() are always paired in sequences like err = handle_mounts(nd, dentry, &path, &inode, &seq); if (unlikely(err < 0)) return err; err = step_into(nd, &path, flags, inode, seq); * in all such sequences path is uninitialized before and unused after this pair of calls * in all such sequences inode and seq are unused afterwards. So the call of handle_mounts() can be shifted inside step_into(), turning 'path' into a local variable in the combined function. Signed-off-by: Al Viro --- fs/namei.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 79f06be7f5d4..8a673f44ffc2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1835,29 +1835,33 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int step_into(struct nameidata *nd, struct path *path, - int flags, struct inode *inode, unsigned seq) +static int step_into(struct nameidata *nd, int flags, + struct dentry *dentry, struct inode *inode, unsigned seq) { - if (likely(!d_is_symlink(path->dentry)) || + struct path path; + int err = handle_mounts(nd, dentry, &path, &inode, &seq); + + if (err < 0) + return err; + if (likely(!d_is_symlink(path.dentry)) || !((flags & WALK_FOLLOW) || (nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - path_to_nameidata(path, nd); + path_to_nameidata(&path, nd); nd->inode = inode; nd->seq = seq; return 0; } /* make sure that d_is_symlink above matches inode */ if (nd->flags & LOOKUP_RCU) { - if (read_seqcount_retry(&path->dentry->d_seq, seq)) + if (read_seqcount_retry(&path.dentry->d_seq, seq)) return -ECHILD; } - return pick_link(nd, path, inode, seq); + return pick_link(nd, &path, inode, seq); } static int walk_component(struct nameidata *nd, int flags) { - struct path path; struct dentry *dentry; struct inode *inode; unsigned seq; @@ -1883,11 +1887,7 @@ static int walk_component(struct nameidata *nd, int flags) } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - - err = handle_mounts(nd, dentry, &path, &inode, &seq); - if (unlikely(err < 0)) - return err; - return step_into(nd, &path, flags, inode, seq); + return step_into(nd, flags, dentry, inode, seq); } /* @@ -2354,17 +2354,10 @@ static inline int lookup_last(struct nameidata *nd) static int handle_lookup_down(struct nameidata *nd) { - struct path path; - struct inode *inode = nd->inode; - unsigned seq = nd->seq; - int err; - if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); - err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq); - if (unlikely(err < 0)) - return err; - return step_into(nd, &path, WALK_NOFOLLOW, inode, seq); + return step_into(nd, WALK_NOFOLLOW, + nd->path.dentry, nd->inode, nd->seq); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -3281,7 +3274,6 @@ static int do_last(struct nameidata *nd, int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; - struct path path; struct dentry *dentry; int error; @@ -3384,10 +3376,7 @@ static int do_last(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - error = handle_mounts(nd, dentry, &path, &inode, &seq); - if (unlikely(error < 0)) - return error; - error = step_into(nd, &path, 0, inode, seq); + error = step_into(nd, 0, dentry, inode, seq); if (unlikely(error)) return error; From 161aff1d93abf0e5b5e9dbca88928998c155f677 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Jan 2020 22:52:26 -0500 Subject: [PATCH 17/69] LOOKUP_MOUNTPOINT: fold path_mountpointat() into path_lookupat() New LOOKUP flag, telling path_lookupat() to act as path_mountpointat(). IOW, traverse mounts at the final point and skip revalidation of the location where it ends up. Signed-off-by: Al Viro --- fs/autofs/dev-ioctl.c | 6 +-- fs/internal.h | 1 - fs/namei.c | 89 +++---------------------------------------- fs/namespace.c | 4 +- include/linux/namei.h | 2 +- 5 files changed, 12 insertions(+), 90 deletions(-) diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index a3cdb0036c5d..f3a0f412b43b 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -186,7 +186,7 @@ static int find_autofs_mount(const char *pathname, struct path path; int err; - err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); + err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path); if (err) return err; err = -ENOENT; @@ -519,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) - err = kern_path_mountpoint(AT_FDCWD, - name, &path, LOOKUP_FOLLOW); + err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT, + &path); else err = find_autofs_mount(name, &path, test_by_type, &type); diff --git a/fs/internal.h b/fs/internal.h index f3f280b952a3..b108a8eb75ca 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -61,7 +61,6 @@ extern int finish_clean_context(struct fs_context *fc); */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); -extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); long do_mknodat(int dfd, const char __user *filename, umode_t mode, diff --git a/fs/namei.c b/fs/namei.c index 8a673f44ffc2..0fcd738b95b9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2382,6 +2382,10 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; + if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { + err = handle_lookup_down(nd); + nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + } if (!err) { *path = nd->path; nd->path.mnt = NULL; @@ -2410,7 +2414,8 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) - audit_inode(name, path->dentry, 0); + audit_inode(name, path->dentry, + flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); putname(name); return retval; @@ -2688,88 +2693,6 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -/** - * path_mountpoint - look up a path to be umounted - * @nd: lookup context - * @flags: lookup flags - * @path: pointer to container for result - * - * Look up the given name, but don't attempt to revalidate the last component. - * Returns 0 and "path" will be valid on success; Returns error otherwise. - */ -static int -path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) -{ - const char *s = path_init(nd, flags); - int err; - - while (!(err = link_path_walk(s, nd)) && - (err = lookup_last(nd)) > 0) { - s = trailing_symlink(nd); - } - if (!err && (nd->flags & LOOKUP_RCU)) - err = unlazy_walk(nd); - if (!err) - err = handle_lookup_down(nd); - if (!err) { - *path = nd->path; - nd->path.mnt = NULL; - nd->path.dentry = NULL; - } - terminate_walk(nd); - return err; -} - -static int -filename_mountpoint(int dfd, struct filename *name, struct path *path, - unsigned int flags) -{ - struct nameidata nd; - int error; - if (IS_ERR(name)) - return PTR_ERR(name); - set_nameidata(&nd, dfd, name); - error = path_mountpoint(&nd, flags | LOOKUP_RCU, path); - if (unlikely(error == -ECHILD)) - error = path_mountpoint(&nd, flags, path); - if (unlikely(error == -ESTALE)) - error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); - if (likely(!error)) - audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL); - restore_nameidata(); - putname(name); - return error; -} - -/** - * user_path_mountpoint_at - lookup a path from userland in order to umount it - * @dfd: directory file descriptor - * @name: pathname from userland - * @flags: lookup flags - * @path: pointer to container to hold result - * - * A umount is a special case for path walking. We're not actually interested - * in the inode in this situation, and ESTALE errors can be a problem. We - * simply want track down the dentry and vfsmount attached at the mountpoint - * and avoid revalidating the last component. - * - * Returns 0 and populates "path" on success. - */ -int -user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, - struct path *path) -{ - return filename_mountpoint(dfd, getname(name), path, flags); -} - -int -kern_path_mountpoint(int dfd, const char *name, struct path *path, - unsigned int flags) -{ - return filename_mountpoint(dfd, getname_kernel(name), path, flags); -} -EXPORT_SYMBOL(kern_path_mountpoint); - int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); diff --git a/fs/namespace.c b/fs/namespace.c index 743980380a8f..a28e4db075ed 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1669,7 +1669,7 @@ int ksys_umount(char __user *name, int flags) struct path path; struct mount *mnt; int retval; - int lookup_flags = 0; + int lookup_flags = LOOKUP_MOUNTPOINT; if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; @@ -1680,7 +1680,7 @@ int ksys_umount(char __user *name, int flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); + retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; mnt = real_mount(path.mnt); diff --git a/include/linux/namei.h b/include/linux/namei.h index 0dd980d7318f..d9576a051808 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -23,6 +23,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */ #define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */ #define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */ +#define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */ #define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */ #define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */ @@ -64,7 +65,6 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); -extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int); extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); From a9dc1494a78291bba3bfd427d90024659c942803 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2020 13:54:30 -0500 Subject: [PATCH 18/69] expand the only remaining call of path_lookup_conditional() Signed-off-by: Al Viro --- fs/namei.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 0fcd738b95b9..e3393511d426 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -858,13 +858,6 @@ static int set_root(struct nameidata *nd) return 0; } -static void path_put_conditional(struct path *path, struct nameidata *nd) -{ - dput(path->dentry); - if (path->mnt != nd->path.mnt) - mntput(path->mnt); -} - static inline void path_to_nameidata(const struct path *path, struct nameidata *nd) { @@ -1312,8 +1305,11 @@ static int follow_managed(struct path *path, struct nameidata *nd) ret = 1; if (ret > 0 && unlikely(d_flags_negative(flags))) ret = -ENOENT; - if (unlikely(ret < 0)) - path_put_conditional(path, nd); + if (unlikely(ret < 0)) { + dput(path->dentry); + if (path->mnt != nd->path.mnt) + mntput(path->mnt); + } return ret; } From 43679723d27f5173a01fc343fb8bf61be5ae04ed Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jan 2020 09:55:16 -0500 Subject: [PATCH 19/69] merging pick_link() with get_link(), part 1 Move restoring LOOKUP_PARENT and zeroing nd->stack.name[0] past the call of get_link() (nothing _currently_ uses them in there). That allows to moved the call of may_follow_link() into get_link() as well, since now the presence of LOOKUP_PARENT distinguishes the callers from each other (link_path_walk() has it, trailing_symlink() doesn't). Preparations for folding trailing_symlink() into callers (lookup_last() and do_last()) and changing the calling conventions of those. Next stage after that will have get_link() call migrate into walk_component(), then - into step_into(). It's tricky enough to warrant doing that in stages, unfortunately... Signed-off-by: Al Viro --- fs/namei.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index e3393511d426..d39be45f91bc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1115,6 +1115,12 @@ const char *get_link(struct nameidata *nd) int error; const char *res; + if (!(nd->flags & LOOKUP_PARENT)) { + error = may_follow_link(nd); + if (unlikely(error)) + return ERR_PTR(error); + } + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) return ERR_PTR(-ELOOP); @@ -2329,13 +2335,9 @@ static const char *path_init(struct nameidata *nd, unsigned flags) static const char *trailing_symlink(struct nameidata *nd) { - const char *s; - int error = may_follow_link(nd); - if (unlikely(error)) - return ERR_PTR(error); + const char *s = get_link(nd); nd->flags |= LOOKUP_PARENT; nd->stack[0].name = NULL; - s = get_link(nd); return s ? s : ""; } From 1ccac622f9da58a113f9c5b8c9d07d76b60bc555 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jan 2020 10:13:40 -0500 Subject: [PATCH 20/69] merging pick_link() with get_link(), part 2 Fold trailing_symlink() into lookup_last() and do_last(), change the calling conventions of those two. Rules change: success, we are done => NULL instead of 0 error => ERR_PTR(-E...) instead of -E... got a symlink to follow => return the path to be followed instead of 1 The loops calling those (in path_lookupat() and path_openat()) adjusted. A subtle change of control flow here: originally a pure-jump trailing symlink ("/" or procfs one) would've passed through the upper level loop once more, with "" for path to traverse. That would've brought us back to the lookup_last/do_last entry and we would've hit LAST_BIND case (LAST_BIND left from get_link() called by trailing_symlink()) and pretty much skip to the point right after where we'd left the sucker back when we picked that trailing symlink. Now we don't bother with that extra pass through the upper level loop - if get_link() says "I've just done a pure jump, nothing else to do", we just treat that as non-symlink case. Boilerplate added on that step will go away shortly - it'll migrate into walk_component() and then to step_into(), collapsing into the change of calling conventions for those. Signed-off-by: Al Viro --- fs/namei.c | 68 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d39be45f91bc..9678bb2179e6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2333,21 +2333,26 @@ static const char *path_init(struct nameidata *nd, unsigned flags) return s; } -static const char *trailing_symlink(struct nameidata *nd) -{ - const char *s = get_link(nd); - nd->flags |= LOOKUP_PARENT; - nd->stack[0].name = NULL; - return s ? s : ""; -} - -static inline int lookup_last(struct nameidata *nd) +static inline const char *lookup_last(struct nameidata *nd) { + int err; if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, 0); + err = walk_component(nd, 0); + if (unlikely(err)) { + const char *s; + if (err < 0) + return PTR_ERR(err); + s = get_link(nd); + if (s) { + nd->flags |= LOOKUP_PARENT; + nd->stack[0].name = NULL; + return s; + } + } + return NULL; } static int handle_lookup_down(struct nameidata *nd) @@ -2370,10 +2375,9 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path s = ERR_PTR(err); } - while (!(err = link_path_walk(s, nd)) - && ((err = lookup_last(nd)) > 0)) { - s = trailing_symlink(nd); - } + while (!(err = link_path_walk(s, nd)) && + (s = lookup_last(nd)) != NULL) + ; if (!err) err = complete_walk(nd); @@ -3183,7 +3187,7 @@ out_dput: /* * Handle the last step of open() */ -static int do_last(struct nameidata *nd, +static const char *do_last(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; @@ -3206,7 +3210,7 @@ static int do_last(struct nameidata *nd, put_link(nd); error = handle_dots(nd, nd->last_type); if (unlikely(error)) - return error; + return ERR_PTR(error); goto finish_open; } @@ -3216,7 +3220,7 @@ static int do_last(struct nameidata *nd, /* we _can_ be in RCU mode here */ dentry = lookup_fast(nd, &inode, &seq); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; @@ -3231,12 +3235,12 @@ static int do_last(struct nameidata *nd, */ error = complete_walk(nd); if (error) - return error; + return ERR_PTR(error); audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) - return -EISDIR; + return ERR_PTR(-EISDIR); } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { @@ -3298,18 +3302,28 @@ finish_lookup: if (nd->depth) put_link(nd); error = step_into(nd, 0, dentry, inode, seq); - if (unlikely(error)) - return error; + if (unlikely(error)) { + const char *s; + if (error < 0) + return ERR_PTR(error); + s = get_link(nd); + if (s) { + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + nd->stack[0].name = NULL; + return s; + } + } if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { audit_inode(nd->name, nd->path.dentry, 0); - return -EEXIST; + return ERR_PTR(-EEXIST); } finish_open: /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) - return error; + return ERR_PTR(error); audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { error = -EISDIR; @@ -3351,7 +3365,7 @@ out: } if (got_write) mnt_drop_write(nd->path.mnt); - return error; + return ERR_PTR(error); } struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) @@ -3454,10 +3468,8 @@ static struct file *path_openat(struct nameidata *nd, } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && - (error = do_last(nd, file, op)) > 0) { - nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - s = trailing_symlink(nd); - } + (s = do_last(nd, file, op)) != NULL) + ; terminate_walk(nd); } if (likely(!error)) { From 40fcf5a931af901198fcfb23a50354e54e1fa7a6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jan 2020 13:07:57 -0500 Subject: [PATCH 21/69] merging pick_link() with get_link(), part 3 After a pure jump ("/" or procfs-style symlink) we don't need to hold the link anymore. link_path_walk() dropped it if such case had been detected, lookup_last/do_last() (i.e. old trailing_symlink()) left it on the stack - it ended up calling terminate_walk() shortly anyway, which would've purged the entire stack. Do it in get_link() itself instead. Simpler logics that way... Signed-off-by: Al Viro --- fs/namei.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9678bb2179e6..f844378ce908 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1154,7 +1154,9 @@ const char *get_link(struct nameidata *nd) } else { res = get(dentry, inode, &last->done); } - if (IS_ERR_OR_NULL(res)) + if (!res) + goto all_done; + if (IS_ERR(res)) return res; } if (*res == '/') { @@ -1164,9 +1166,11 @@ const char *get_link(struct nameidata *nd) while (unlikely(*++res == '/')) ; } - if (!*res) - res = NULL; - return res; + if (*res) + return res; +all_done: // pure jump + put_link(nd); + return NULL; } /* @@ -2211,11 +2215,7 @@ OK: if (IS_ERR(s)) return PTR_ERR(s); - err = 0; - if (unlikely(!s)) { - /* jumped */ - put_link(nd); - } else { + if (likely(s)) { nd->stack[nd->depth - 1].name = name; name = s; continue; From 92d270165cffb57a91e036bea2f4d6f2f8522cf1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jan 2020 13:24:17 -0500 Subject: [PATCH 22/69] merging pick_link() with get_link(), part 4 Move the call of get_link() into walk_component(). Change the calling conventions for walk_component() to returning the link body to follow (if any). Signed-off-by: Al Viro --- fs/namei.c | 59 ++++++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f844378ce908..c8a933c2f14a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1866,7 +1866,7 @@ static int step_into(struct nameidata *nd, int flags, return pick_link(nd, &path, inode, seq); } -static int walk_component(struct nameidata *nd, int flags) +static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; struct inode *inode; @@ -1881,19 +1881,24 @@ static int walk_component(struct nameidata *nd, int flags) if (!(flags & WALK_MORE) && nd->depth) put_link(nd); err = handle_dots(nd, nd->last_type); - return err; + return ERR_PTR(err); } dentry = lookup_fast(nd, &inode, &seq); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + return ERR_CAST(dentry); if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + return ERR_CAST(dentry); } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - return step_into(nd, flags, dentry, inode, seq); + err = step_into(nd, flags, dentry, inode, seq); + if (!err) + return NULL; + if (err > 0) + return get_link(nd); + return ERR_PTR(err); } /* @@ -2145,6 +2150,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { + const char *link; u64 hash_len; int type; @@ -2202,24 +2208,18 @@ OK: if (!name) return 0; /* last component of nested symlink */ - err = walk_component(nd, WALK_FOLLOW); + link = walk_component(nd, WALK_FOLLOW); } else { /* not the last component */ - err = walk_component(nd, WALK_FOLLOW | WALK_MORE); + link = walk_component(nd, WALK_FOLLOW | WALK_MORE); } - if (err < 0) - return err; - - if (err) { - const char *s = get_link(nd); - - if (IS_ERR(s)) - return PTR_ERR(s); - if (likely(s)) { - nd->stack[nd->depth - 1].name = name; - name = s; - continue; - } + if (unlikely(link)) { + if (IS_ERR(link)) + return PTR_ERR(link); + /* a symlink to follow */ + nd->stack[nd->depth - 1].name = name; + name = link; + continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { @@ -2335,24 +2335,17 @@ static const char *path_init(struct nameidata *nd, unsigned flags) static inline const char *lookup_last(struct nameidata *nd) { - int err; + const char *link; if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - err = walk_component(nd, 0); - if (unlikely(err)) { - const char *s; - if (err < 0) - return PTR_ERR(err); - s = get_link(nd); - if (s) { - nd->flags |= LOOKUP_PARENT; - nd->stack[0].name = NULL; - return s; - } + link = walk_component(nd, 0); + if (link) { + nd->flags |= LOOKUP_PARENT; + nd->stack[0].name = NULL; } - return NULL; + return link; } static int handle_lookup_down(struct nameidata *nd) From b0417d2c729891eb2566d58a6d63b61a1ff38141 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jan 2020 13:34:20 -0500 Subject: [PATCH 23/69] merging pick_link() with get_link(), part 5 move get_link() call into step_into(). Signed-off-by: Al Viro --- fs/namei.c | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index c8a933c2f14a..2c8a814f3193 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1841,14 +1841,14 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static int step_into(struct nameidata *nd, int flags, +static const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry, struct inode *inode, unsigned seq) { struct path path; int err = handle_mounts(nd, dentry, &path, &inode, &seq); if (err < 0) - return err; + return ERR_PTR(err); if (likely(!d_is_symlink(path.dentry)) || !((flags & WALK_FOLLOW) || (nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { @@ -1856,14 +1856,17 @@ static int step_into(struct nameidata *nd, int flags, path_to_nameidata(&path, nd); nd->inode = inode; nd->seq = seq; - return 0; + return NULL; } /* make sure that d_is_symlink above matches inode */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, seq)) - return -ECHILD; + return ERR_PTR(-ECHILD); } - return pick_link(nd, &path, inode, seq); + err = pick_link(nd, &path, inode, seq); + if (err > 0) + return get_link(nd); + return ERR_PTR(err); } static const char *walk_component(struct nameidata *nd, int flags) @@ -1893,12 +1896,7 @@ static const char *walk_component(struct nameidata *nd, int flags) } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - err = step_into(nd, flags, dentry, inode, seq); - if (!err) - return NULL; - if (err > 0) - return get_link(nd); - return ERR_PTR(err); + return step_into(nd, flags, dentry, inode, seq); } /* @@ -2352,8 +2350,8 @@ static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); - return step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq); + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, + nd->path.dentry, nd->inode, nd->seq)); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -3193,6 +3191,7 @@ static const char *do_last(struct nameidata *nd, unsigned seq; struct inode *inode; struct dentry *dentry; + const char *res; int error; nd->flags &= ~LOOKUP_PARENT; @@ -3294,18 +3293,12 @@ static const char *do_last(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - error = step_into(nd, 0, dentry, inode, seq); - if (unlikely(error)) { - const char *s; - if (error < 0) - return ERR_PTR(error); - s = get_link(nd); - if (s) { - nd->flags |= LOOKUP_PARENT; - nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - nd->stack[0].name = NULL; - return s; - } + res = step_into(nd, 0, dentry, inode, seq); + if (unlikely(res)) { + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + nd->stack[0].name = NULL; + return res; } if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { From 06708adb99e88d02556fb546b5c70ff8566bfce9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jan 2020 14:26:57 -0500 Subject: [PATCH 24/69] merging pick_link() with get_link(), part 6 move the only remaining call of get_link() into pick_link() Signed-off-by: Al Viro --- fs/namei.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 2c8a814f3193..f5959f65a70e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1793,14 +1793,14 @@ static inline int handle_dots(struct nameidata *nd, int type) return 0; } -static int pick_link(struct nameidata *nd, struct path *link, +static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, unsigned seq) { int error; struct saved *last; if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { path_to_nameidata(link, nd); - return -ELOOP; + return ERR_PTR(-ELOOP); } if (!(nd->flags & LOOKUP_RCU)) { if (link->mnt == nd->path.mnt) @@ -1821,7 +1821,7 @@ static int pick_link(struct nameidata *nd, struct path *link, } if (error) { path_put(link); - return error; + return ERR_PTR(error); } } @@ -1830,7 +1830,7 @@ static int pick_link(struct nameidata *nd, struct path *link, clear_delayed_call(&last->done); nd->link_inode = inode; last->seq = seq; - return 1; + return get_link(nd); } enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; @@ -1863,10 +1863,7 @@ static const char *step_into(struct nameidata *nd, int flags, if (read_seqcount_retry(&path.dentry->d_seq, seq)) return ERR_PTR(-ECHILD); } - err = pick_link(nd, &path, inode, seq); - if (err > 0) - return get_link(nd); - return ERR_PTR(err); + return pick_link(nd, &path, inode, seq); } static const char *walk_component(struct nameidata *nd, int flags) From ad6cc4c338f4e5124c29971ea749073a697063a1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 14 Jan 2020 14:41:39 -0500 Subject: [PATCH 25/69] finally fold get_link() into pick_link() kill nd->link_inode, while we are at it Signed-off-by: Al Viro --- fs/namei.c | 135 ++++++++++++++++++++++++----------------------------- 1 file changed, 61 insertions(+), 74 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index f5959f65a70e..02f148124831 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -503,7 +503,6 @@ struct nameidata { } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; - struct inode *link_inode; unsigned root_seq; int dfd; } __randomize_layout; @@ -962,9 +961,8 @@ int sysctl_protected_regular __read_mostly; * * Returns 0 if following the symlink is allowed, -ve on error. */ -static inline int may_follow_link(struct nameidata *nd) +static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - const struct inode *inode; const struct inode *parent; kuid_t puid; @@ -972,7 +970,6 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if owner and follower match. */ - inode = nd->link_inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; @@ -1106,73 +1103,6 @@ static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid, return 0; } -static __always_inline -const char *get_link(struct nameidata *nd) -{ - struct saved *last = nd->stack + nd->depth - 1; - struct dentry *dentry = last->link.dentry; - struct inode *inode = nd->link_inode; - int error; - const char *res; - - if (!(nd->flags & LOOKUP_PARENT)) { - error = may_follow_link(nd); - if (unlikely(error)) - return ERR_PTR(error); - } - - if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) - return ERR_PTR(-ELOOP); - - if (!(nd->flags & LOOKUP_RCU)) { - touch_atime(&last->link); - cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { - if (unlikely(unlazy_walk(nd))) - return ERR_PTR(-ECHILD); - touch_atime(&last->link); - } - - error = security_inode_follow_link(dentry, inode, - nd->flags & LOOKUP_RCU); - if (unlikely(error)) - return ERR_PTR(error); - - nd->last_type = LAST_BIND; - res = READ_ONCE(inode->i_link); - if (!res) { - const char * (*get)(struct dentry *, struct inode *, - struct delayed_call *); - get = inode->i_op->get_link; - if (nd->flags & LOOKUP_RCU) { - res = get(NULL, inode, &last->done); - if (res == ERR_PTR(-ECHILD)) { - if (unlikely(unlazy_walk(nd))) - return ERR_PTR(-ECHILD); - res = get(dentry, inode, &last->done); - } - } else { - res = get(dentry, inode, &last->done); - } - if (!res) - goto all_done; - if (IS_ERR(res)) - return res; - } - if (*res == '/') { - error = nd_jump_root(nd); - if (unlikely(error)) - return ERR_PTR(error); - while (unlikely(*++res == '/')) - ; - } - if (*res) - return res; -all_done: // pure jump - put_link(nd); - return NULL; -} - /* * follow_up - Find the mountpoint of path's vfsmount * @@ -1796,8 +1726,10 @@ static inline int handle_dots(struct nameidata *nd, int type) static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, unsigned seq) { - int error; struct saved *last; + const char *res; + int error; + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { path_to_nameidata(link, nd); return ERR_PTR(-ELOOP); @@ -1828,9 +1760,64 @@ static const char *pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); - nd->link_inode = inode; last->seq = seq; - return get_link(nd); + + if (!(nd->flags & LOOKUP_PARENT)) { + error = may_follow_link(nd, inode); + if (unlikely(error)) + return ERR_PTR(error); + } + + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + return ERR_PTR(-ELOOP); + + if (!(nd->flags & LOOKUP_RCU)) { + touch_atime(&last->link); + cond_resched(); + } else if (atime_needs_update(&last->link, inode)) { + if (unlikely(unlazy_walk(nd))) + return ERR_PTR(-ECHILD); + touch_atime(&last->link); + } + + error = security_inode_follow_link(link->dentry, inode, + nd->flags & LOOKUP_RCU); + if (unlikely(error)) + return ERR_PTR(error); + + nd->last_type = LAST_BIND; + res = READ_ONCE(inode->i_link); + if (!res) { + const char * (*get)(struct dentry *, struct inode *, + struct delayed_call *); + get = inode->i_op->get_link; + if (nd->flags & LOOKUP_RCU) { + res = get(NULL, inode, &last->done); + if (res == ERR_PTR(-ECHILD)) { + if (unlikely(unlazy_walk(nd))) + return ERR_PTR(-ECHILD); + res = get(link->dentry, inode, &last->done); + } + } else { + res = get(link->dentry, inode, &last->done); + } + if (!res) + goto all_done; + if (IS_ERR(res)) + return res; + } + if (*res == '/') { + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); + while (unlikely(*++res == '/')) + ; + } + if (*res) + return res; +all_done: // pure jump + put_link(nd); + return NULL; } enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; From b4c0353693d22f9dcfa4757262dab0aab8376d19 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jan 2020 11:44:51 -0500 Subject: [PATCH 26/69] sanitize handling of nd->last_type, kill LAST_BIND ->last_type values are set in 3 places: path_init() (sets to LAST_ROOT), link_path_walk (LAST_NORM/DOT/DOTDOT) and pick_link (LAST_BIND). The are checked in walk_component(), lookup_last() and do_last(). They also get copied to the caller by filename_parentat(). In the last 3 cases the value is what we had at the return from link_path_walk(). In case of walk_component() it's either directly downstream from assignment in link_path_walk() or, when called by lookup_last(), the value we have at the return from link_path_walk(). The value at the entry into link_path_walk() can survive to return only if the pathname contains nothing but slashes. Note that pick_link() never returns such - pure jumps are handled directly. So for the calls of link_path_walk() for trailing symlinks it does not matter what value had been there at the entry; the value at the return won't depend upon it. There are 3 call chains that might have pick_link() storing LAST_BIND: 1) pick_link() from step_into() from walk_component() from link_path_walk(). In that case we will either be parsing the next component immediately after return into link_path_walk(), which will overwrite the ->last_type before anyone has a chance to look at it, or we'll fail, in which case nobody will be looking at ->last_type at all. 2) pick_link() from step_into() from walk_component() from lookup_last(). The value is never looked at due to the above; it won't affect the value seen at return from any link_path_walk(). 3) pick_link() from step_into() from do_last(). Ditto. In other words, assignemnt in pick_link() is pointless, and so is LAST_BIND itself; nothing ever looks at that value. Kill it off. And make link_path_walk() _always_ assign ->last_type - in the only case when the value at the entry might survive to the return that value is always LAST_ROOT, inherited from path_init(). Move that assignment from path_init() into the beginning of link_path_walk(), to consolidate the things. Historical note: LAST_BIND used to be used for the kludge with trailing pure jump symlinks (extra iteration through the top-level loop). No point keeping it anymore... Signed-off-by: Al Viro --- Documentation/filesystems/path-lookup.rst | 7 ++----- fs/namei.c | 3 +-- include/linux/namei.h | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst index a3216979298b..f46b05e9b96c 100644 --- a/Documentation/filesystems/path-lookup.rst +++ b/Documentation/filesystems/path-lookup.rst @@ -404,11 +404,8 @@ that is the "next" component in the pathname. ``int last_type`` ~~~~~~~~~~~~~~~~~ -This is one of ``LAST_NORM``, ``LAST_ROOT``, ``LAST_DOT``, ``LAST_DOTDOT``, or -``LAST_BIND``. The ``last`` field is only valid if the type is -``LAST_NORM``. ``LAST_BIND`` is used when following a symlink and no -components of the symlink have been processed yet. Others should be -fairly self-explanatory. +This is one of ``LAST_NORM``, ``LAST_ROOT``, ``LAST_DOT`` or ``LAST_DOTDOT``. +The ``last`` field is only valid if the type is ``LAST_NORM``. ``struct path root`` ~~~~~~~~~~~~~~~~~~~~ diff --git a/fs/namei.c b/fs/namei.c index 02f148124831..1a83641e95e6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1785,7 +1785,6 @@ static const char *pick_link(struct nameidata *nd, struct path *link, if (unlikely(error)) return ERR_PTR(error); - nd->last_type = LAST_BIND; res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, @@ -2123,6 +2122,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) { int err; + nd->last_type = LAST_ROOT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') @@ -2224,7 +2224,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; diff --git a/include/linux/namei.h b/include/linux/namei.h index d9576a051808..a4bb992623c4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -15,7 +15,7 @@ enum { MAX_NESTED_LINKS = 8 }; /* * Type of the last component on LOOKUP_PARENT */ -enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; +enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; /* pathwalk mode */ #define LOOKUP_FOLLOW 0x0001 /* follow links at the end */ From 8c4efe22e7c4de1d44f624120a979e31e3a584b8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jan 2020 12:44:18 -0500 Subject: [PATCH 27/69] namei: invert the meaning of WALK_FOLLOW old flags & WALK_FOLLOW <=> new !(flags & WALK_TRAILING) That's what that flag had really been used for. Signed-off-by: Al Viro --- fs/namei.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1a83641e95e6..5d25a3874a5b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1819,7 +1819,7 @@ all_done: // pure jump return NULL; } -enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; +enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; /* * Do we need to follow links? We _really_ want to be able @@ -1836,7 +1836,7 @@ static const char *step_into(struct nameidata *nd, int flags, if (err < 0) return ERR_PTR(err); if (likely(!d_is_symlink(path.dentry)) || - !((flags & WALK_FOLLOW) || (nd->flags & LOOKUP_FOLLOW)) || + ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ path_to_nameidata(&path, nd); @@ -2190,10 +2190,10 @@ OK: if (!name) return 0; /* last component of nested symlink */ - link = walk_component(nd, WALK_FOLLOW); + link = walk_component(nd, 0); } else { /* not the last component */ - link = walk_component(nd, WALK_FOLLOW | WALK_MORE); + link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) @@ -2321,7 +2321,7 @@ static inline const char *lookup_last(struct nameidata *nd) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - link = walk_component(nd, 0); + link = walk_component(nd, WALK_TRAILING); if (link) { nd->flags |= LOOKUP_PARENT; nd->stack[0].name = NULL; @@ -3276,7 +3276,7 @@ static const char *do_last(struct nameidata *nd, finish_lookup: if (nd->depth) put_link(nd); - res = step_into(nd, 0, dentry, inode, seq); + res = step_into(nd, WALK_TRAILING, dentry, inode, seq); if (unlikely(res)) { nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); From b1a819724074f96bfd41c8f7825c3974b91fe518 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jan 2020 12:48:44 -0500 Subject: [PATCH 28/69] pick_link(): check for WALK_TRAILING, not LOOKUP_PARENT Signed-off-by: Al Viro --- fs/namei.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 5d25a3874a5b..ff028f12cb95 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1723,8 +1723,10 @@ static inline int handle_dots(struct nameidata *nd, int type) return 0; } +enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; + static const char *pick_link(struct nameidata *nd, struct path *link, - struct inode *inode, unsigned seq) + struct inode *inode, unsigned seq, int flags) { struct saved *last; const char *res; @@ -1762,7 +1764,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link, clear_delayed_call(&last->done); last->seq = seq; - if (!(nd->flags & LOOKUP_PARENT)) { + if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); @@ -1819,8 +1821,6 @@ all_done: // pure jump return NULL; } -enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; - /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, @@ -1849,7 +1849,7 @@ static const char *step_into(struct nameidata *nd, int flags, if (read_seqcount_retry(&path.dentry->d_seq, seq)) return ERR_PTR(-ECHILD); } - return pick_link(nd, &path, inode, seq); + return pick_link(nd, &path, inode, seq, flags); } static const char *walk_component(struct nameidata *nd, int flags) From d8d4611a4f2dd6be8fe3ddb5cf8eea7f1dfe0787 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 23 Feb 2020 22:04:15 -0500 Subject: [PATCH 29/69] link_path_walk(): simplify stack handling We use nd->stack to store two things: pinning down the symlinks we are resolving and resuming the name traversal when a nested symlink is finished. Currently, nd->depth is used to keep track of both. It's 0 when we call link_path_walk() for the first time (for the pathname itself) and 1 on all subsequent calls (for trailing symlinks, if any). That's fine, as far as pinning symlinks goes - when handling a trailing symlink, the string we are interpreting is the body of symlink pinned down in nd->stack[0]. It's rather inconvenient with respect to handling nested symlinks, though - when we run out of a string we are currently interpreting, we need to decide whether it's a nested symlink (in which case we need to pick the string saved back when we started to interpret that nested symlink and resume its traversal) or not (in which case we are done with link_path_walk()). Current solution is a bit of a kludge - in handling of trailing symlink (in lookup_last() and open_last_lookups() we clear nd->stack[0].name. That allows link_path_walk() to use the following rules when running out of a string to interpret: * if nd->depth is zero, we are at the end of pathname itself. * if nd->depth is positive, check the saved string; for nested symlink it will be non-NULL, for trailing symlink - NULL. It works, but it's rather non-obvious. Note that we have two sets: the set of symlinks currently being traversed and the set of postponed pathname tails. The former is stored in nd->stack[0..nd->depth-1].link and it's valid throught the pathname resolution; the latter is valid only during an individual call of link_path_walk() and it occupies nd->stack[0..nd->depth-1].name for the first call of link_path_walk() and nd->stack[1..nd->depth-1].name for subsequent ones. The kludge is basically a way to recognize the second set becoming empty. The things get simpler if we keep track of the second set's size explicitly and always store it in nd->stack[0..depth-1].name. We access the second set only inside link_path_walk(), so its size can live in a local variable; that way the check becomes trivial without the need of that kludge. Signed-off-by: Al Viro --- fs/namei.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ff028f12cb95..04c1d798013f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2120,6 +2120,7 @@ static inline u64 hash_name(const void *salt, const char *name) */ static int link_path_walk(const char *name, struct nameidata *nd) { + int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; @@ -2182,14 +2183,11 @@ static int link_path_walk(const char *name, struct nameidata *nd) } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: - /* pathname body, done */ - if (!nd->depth) - return 0; - name = nd->stack[nd->depth - 1].name; - /* trailing symlink, done */ - if (!name) + /* pathname or trailing symlink, done */ + if (!depth) return 0; /* last component of nested symlink */ + name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ @@ -2199,7 +2197,7 @@ OK: if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ - nd->stack[nd->depth - 1].name = name; + nd->stack[depth++].name = name; name = link; continue; } @@ -2324,7 +2322,6 @@ static inline const char *lookup_last(struct nameidata *nd) link = walk_component(nd, WALK_TRAILING); if (link) { nd->flags |= LOOKUP_PARENT; - nd->stack[0].name = NULL; } return link; } @@ -3280,7 +3277,6 @@ finish_lookup: if (unlikely(res)) { nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - nd->stack[0].name = NULL; return res; } From c108837e06b659fd1dd144c415a25b7ac339926b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 5 Mar 2020 15:48:44 -0500 Subject: [PATCH 30/69] namei: have link_path_walk() maintain LOOKUP_PARENT set on entry, clear when we get to the last component. Signed-off-by: Al Viro --- fs/namei.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 04c1d798013f..b45ec86dc7b3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2124,6 +2124,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) int err; nd->last_type = LAST_ROOT; + nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') @@ -2184,8 +2185,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ - if (!depth) + if (!depth) { + nd->flags &= ~LOOKUP_PARENT; return 0; + } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); @@ -2222,7 +2225,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); @@ -2314,16 +2317,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags) static inline const char *lookup_last(struct nameidata *nd) { - const char *link; if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - nd->flags &= ~LOOKUP_PARENT; - link = walk_component(nd, WALK_TRAILING); - if (link) { - nd->flags |= LOOKUP_PARENT; - } - return link; + return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) @@ -3174,7 +3171,6 @@ static const char *do_last(struct nameidata *nd, const char *res; int error; - nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { @@ -3275,7 +3271,6 @@ finish_lookup: put_link(nd); res = step_into(nd, WALK_TRAILING, dentry, inode, seq); if (unlikely(res)) { - nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } From ea936aeb3ead0551945c85ff7a7d77d9a7259664 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Jan 2020 09:52:04 -0500 Subject: [PATCH 31/69] massage __follow_mount_rcu() a bit make the loop more similar to that in follow_managed(), with explicit tracking of flags, etc. Signed-off-by: Al Viro --- fs/namei.c | 70 +++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index b45ec86dc7b3..18c46b8db244 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1269,12 +1269,6 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline int managed_dentry_rcu(const struct path *path) -{ - return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? - path->dentry->d_op->d_manage(path, true) : 0; -} - /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. @@ -1282,43 +1276,49 @@ static inline int managed_dentry_rcu(const struct path *path) static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, unsigned *seqp) { + struct dentry *dentry = path->dentry; + unsigned int flags = dentry->d_flags; + + if (likely(!(flags & DCACHE_MANAGED_DENTRY))) + return true; + + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return false; + for (;;) { - struct mount *mounted; /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - switch (managed_dentry_rcu(path)) { - case -ECHILD: - default: - return false; - case -EISDIR: - return true; - case 0: - break; + if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { + int res = dentry->d_op->d_manage(path, true); + if (res) + return res == -EISDIR; + flags = dentry->d_flags; } - if (!d_mountpoint(path->dentry)) - return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); - - mounted = __lookup_mnt(path->mnt, path->dentry); - if (!mounted) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return false; - path->mnt = &mounted->mnt; - path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; - *seqp = read_seqcount_begin(&path->dentry->d_seq); - /* - * Update the inode too. We don't need to re-check the - * dentry sequence number here after this d_inode read, - * because a mount-point is always pinned. - */ - *inode = path->dentry->d_inode; + if (flags & DCACHE_MOUNTED) { + struct mount *mounted = __lookup_mnt(path->mnt, dentry); + if (mounted) { + path->mnt = &mounted->mnt; + dentry = path->dentry = mounted->mnt.mnt_root; + nd->flags |= LOOKUP_JUMPED; + *seqp = read_seqcount_begin(&dentry->d_seq); + *inode = dentry->d_inode; + /* + * We don't need to re-check ->d_seq after this + * ->d_inode read - there will be an RCU delay + * between mount hash removal and ->mnt_root + * becoming unpinned. + */ + flags = dentry->d_flags; + continue; + } + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; + } + return !(flags & DCACHE_NEED_AUTOMOUNT); } - return !read_seqretry(&mount_lock, nd->m_seq) && - !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, From 9deed3ebca244663530782631834e706a86a8c8f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jan 2020 08:45:08 -0500 Subject: [PATCH 32/69] new helper: traverse_mounts() common guts of follow_down() and follow_managed() taken to a new helper - traverse_mounts(). The remnants of follow_managed() are folded into its sole remaining caller (handle_mounts()). Calling conventions of handle_mounts() slightly sanitized - instead of the weird "1 for success, -E... for failure" that used to be imposed by the calling conventions of walk_component() et.al. we can use the normal "0 for success, -E... for failure". Signed-off-by: Al Viro --- fs/namei.c | 177 ++++++++++++++++++++++------------------------------- 1 file changed, 72 insertions(+), 105 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 18c46b8db244..40d5f7abfa54 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1168,91 +1168,79 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags } /* - * Handle a dentry that is managed in some way. - * - Flagged for transit management (autofs) - * - Flagged as mountpoint - * - Flagged as automount point - * - * This may only be called in refwalk mode. - * On success path->dentry is known positive. - * - * Serialization is taken care of in namespace.c + * mount traversal - out-of-line part. One note on ->d_flags accesses - + * dentries are pinned but not locked here, so negative dentry can go + * positive right under us. Use of smp_load_acquire() provides a barrier + * sufficient for ->d_inode and ->d_flags consistency. */ -static int follow_managed(struct path *path, struct nameidata *nd) +static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, + int *count, unsigned lookup_flags) { - struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ - unsigned flags; + struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; - /* Given that we're not holding a lock here, we retain the value in a - * local variable for each dentry as we look at it so that we don't see - * the components of that value change under us */ - while (flags = smp_load_acquire(&path->dentry->d_flags), - unlikely(flags & DCACHE_MANAGED_DENTRY)) { + while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { - BUG_ON(!path->dentry->d_op); - BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } - /* Transit to a mounted filesystem. */ - if (flags & DCACHE_MOUNTED) { + if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); - if (mounted) { + if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); + // here we know it's positive + flags = path->dentry->d_flags; need_mntput = true; continue; } - - /* Something is mounted on this dentry in another - * namespace and/or whatever was mounted there in this - * namespace got unmounted before lookup_mnt() could - * get it */ } - /* Handle an automount point */ - if (flags & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, &nd->total_link_count, - nd->flags); - if (ret < 0) - break; - continue; - } + if (!(flags & DCACHE_NEED_AUTOMOUNT)) + break; - /* We didn't change the current path point */ - break; + // uncovered automount point + ret = follow_automount(path, count, lookup_flags); + flags = smp_load_acquire(&path->dentry->d_flags); + if (ret < 0) + break; } - if (need_mntput) { - if (path->mnt == mnt) - mntput(path->mnt); - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else - nd->flags |= LOOKUP_JUMPED; - } - if (ret == -EISDIR || !ret) - ret = 1; - if (ret > 0 && unlikely(d_flags_negative(flags))) + if (ret == -EISDIR) + ret = 0; + // possible if you race with several mount --move + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; - if (unlikely(ret < 0)) { - dput(path->dentry); - if (path->mnt != nd->path.mnt) - mntput(path->mnt); - } + *jumped = need_mntput; return ret; } +static inline int traverse_mounts(struct path *path, bool *jumped, + int *count, unsigned lookup_flags) +{ + unsigned flags = smp_load_acquire(&path->dentry->d_flags); + + /* fastpath */ + if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { + *jumped = false; + if (unlikely(d_flags_negative(flags))) + return -ENOENT; + return 0; + } + return __traverse_mounts(path, flags, jumped, count, lookup_flags); +} + int follow_down_one(struct path *path) { struct vfsmount *mounted; @@ -1269,6 +1257,23 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + */ +int follow_down(struct path *path) +{ + struct vfsmount *mnt = path->mnt; + bool jumped; + int ret = traverse_mounts(path, &jumped, NULL, 0); + + if (path->mnt != mnt) + mntput(mnt); + return ret; +} +EXPORT_SYMBOL(follow_down); + /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. @@ -1325,6 +1330,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path, struct inode **inode, unsigned int *seqp) { + bool jumped; int ret; path->mnt = nd->path.mnt; @@ -1334,15 +1340,25 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, if (unlikely(!*inode)) return -ENOENT; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) - return 1; + return 0; if (unlazy_child(nd, dentry, seq)) return -ECHILD; // *path might've been clobbered by __follow_mount_rcu() path->mnt = nd->path.mnt; path->dentry = dentry; } - ret = follow_managed(path, nd); - if (likely(ret >= 0)) { + ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); + if (jumped) { + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + ret = -EXDEV; + else + nd->flags |= LOOKUP_JUMPED; + } + if (unlikely(ret)) { + dput(path->dentry); + if (path->mnt != nd->path.mnt) + mntput(path->mnt); + } else { *inode = d_backing_inode(path->dentry); *seqp = 0; /* out of RCU mode, so the value doesn't matter */ } @@ -1410,55 +1426,6 @@ static int follow_dotdot_rcu(struct nameidata *nd) return 0; } -/* - * Follow down to the covering mount currently visible to userspace. At each - * point, the filesystem owning that dentry may be queried as to whether the - * caller is permitted to proceed or not. - */ -int follow_down(struct path *path) -{ - unsigned managed; - int ret; - - while (managed = READ_ONCE(path->dentry->d_flags), - unlikely(managed & DCACHE_MANAGED_DENTRY)) { - /* Allow the filesystem to manage the transit without i_mutex - * being held. - * - * We indicate to the filesystem if someone is trying to mount - * something here. This gives autofs the chance to deny anyone - * other than its daemon the right to mount on its - * superstructure. - * - * The filesystem may sleep at this point. - */ - if (managed & DCACHE_MANAGE_TRANSIT) { - BUG_ON(!path->dentry->d_op); - BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path, false); - if (ret < 0) - return ret == -EISDIR ? 0 : ret; - } - - /* Transit to a mounted filesystem. */ - if (managed & DCACHE_MOUNTED) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - continue; - } - - /* Don't handle automount points here */ - break; - } - return 0; -} -EXPORT_SYMBOL(follow_down); - /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() */ From 6fb968cdf9d0bd66742ac73f3db268dcb0278ada Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2020 09:53:19 -0500 Subject: [PATCH 33/69] atomic_open(): return the right dentry in FMODE_OPENED case ->atomic_open() might have used a different alias than the one we'd passed to it; in "not opened" case we take care of that, in "opened" one we don't. Currently we don't care downstream of "opened" case which alias to return; however, that will change shortly when we get to unifying may_open() calls. It's not hard to get right in all cases, anyway. Signed-off-by: Al Viro --- fs/namei.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 40d5f7abfa54..88f985aff4f8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2949,11 +2949,15 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { + int acc_mode = op->acc_mode; + if (unlikely(dentry != file->f_path.dentry)) { + dput(dentry); + dentry = dget(file->f_path.dentry); + } /* * We didn't have the inode before the open, so check open * permission here. */ - int acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { WARN_ON(!(open_flag & O_CREAT)); fsnotify_create(dir, dentry); From 7be219b4dcd9c698a120726e48a52b96f554fd2c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2020 10:02:29 -0500 Subject: [PATCH 34/69] atomic_open(): lift the call of may_open() into do_last() there we'll be able to merge it with its counterparts in other cases, and there's no reason to do it before the parent has been unlocked Signed-off-by: Al Viro --- fs/namei.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 88f985aff4f8..7e932d9a71a9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2949,23 +2949,12 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { - int acc_mode = op->acc_mode; if (unlikely(dentry != file->f_path.dentry)) { dput(dentry); dentry = dget(file->f_path.dentry); } - /* - * We didn't have the inode before the open, so check open - * permission here. - */ - if (file->f_mode & FMODE_CREATED) { - WARN_ON(!(open_flag & O_CREAT)); + if (file->f_mode & FMODE_CREATED) fsnotify_create(dir, dentry); - acc_mode = 0; - } - error = may_open(&file->f_path, acc_mode, open_flag); - if (WARN_ON(error > 0)) - error = -EINVAL; } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { @@ -3208,12 +3197,19 @@ static const char *do_last(struct nameidata *nd, } if (file->f_mode & FMODE_OPENED) { - if ((file->f_mode & FMODE_CREATED) || - !S_ISREG(file_inode(file)->i_mode)) + if (file->f_mode & FMODE_CREATED) { + open_flag &= ~O_TRUNC; + will_truncate = false; + acc_mode = 0; + } else if (!S_ISREG(file_inode(file)->i_mode)) will_truncate = false; audit_inode(nd->name, file->f_path.dentry, 0); - dput(dentry); + dput(nd->path.dentry); + nd->path.dentry = dentry; + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto out; goto opened; } From 3ad5615a071f7c1c41b2e23b9ffde73b09499eb0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2020 10:06:13 -0500 Subject: [PATCH 35/69] do_last(): merge the may_open() calls have FMODE_OPENED case rejoin the main path at earlier point Signed-off-by: Al Viro --- fs/namei.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 7e932d9a71a9..2f8a5d3be784 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3207,10 +3207,7 @@ static const char *do_last(struct nameidata *nd, audit_inode(nd->name, file->f_path.dentry, 0); dput(nd->path.dentry); nd->path.dentry = dentry; - error = may_open(&nd->path, acc_mode, open_flag); - if (error) - goto out; - goto opened; + goto finish_open_created; } if (file->f_mode & FMODE_CREATED) { @@ -3277,11 +3274,10 @@ finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - error = vfs_open(&nd->path, file); + if (!(file->f_mode & FMODE_OPENED)) + error = vfs_open(&nd->path, file); if (error) goto out; -opened: error = ima_file_check(file, op->acc_mode); if (!error && will_truncate) error = handle_truncate(file); From 59e96e65833e05a9410327627d91ccef39392a25 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2020 10:22:24 -0500 Subject: [PATCH 36/69] do_last(): don't bother with keeping got_write in FMODE_OPENED case it's easier to drop it right after lookup_open() and regain if needed (i.e. if we will need to truncate). On the non-FMODE_OPENED path we do that anyway. In case of FMODE_CREATED we won't be needing it. And it's easier to prove correctness that way, especially since the initial failure to get write access is not always fatal; proving that we'll never end up truncating in that case is rather convoluted. Signed-off-by: Al Viro --- fs/namei.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 2f8a5d3be784..2eef41f505db 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3191,11 +3191,14 @@ static const char *do_last(struct nameidata *nd, else inode_unlock_shared(dir->d_inode); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out; + if (got_write) { + mnt_drop_write(nd->path.mnt); + got_write = false; } + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (file->f_mode & FMODE_OPENED) { if (file->f_mode & FMODE_CREATED) { open_flag &= ~O_TRUNC; @@ -3220,16 +3223,6 @@ static const char *do_last(struct nameidata *nd, goto finish_open_created; } - /* - * If atomic_open() acquired write access it is dropped now due to - * possible mount and symlink following (this might be optimized away if - * necessary...) - */ - if (got_write) { - mnt_drop_write(nd->path.mnt); - got_write = false; - } - finish_lookup: if (nd->depth) put_link(nd); @@ -3250,27 +3243,25 @@ finish_open: return ERR_PTR(error); audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { - error = -EISDIR; if (d_is_dir(nd->path.dentry)) - goto out; + return ERR_PTR(-EISDIR); error = may_create_in_sticky(dir_mode, dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) - goto out; + return ERR_PTR(error); } - error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) - goto out; + return ERR_PTR(-ENOTDIR); if (!d_is_reg(nd->path.dentry)) will_truncate = false; +finish_open_created: if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) - goto out; + return ERR_PTR(error); got_write = true; } -finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; From 5a2d3edd8dad8c8870419c614e3a7449effe87cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2020 10:32:22 -0500 Subject: [PATCH 37/69] do_last(): rejoing the common path earlier in FMODE_{OPENED,CREATED} case Signed-off-by: Al Viro --- fs/namei.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 2eef41f505db..c85fdfd6b33d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3200,13 +3200,6 @@ static const char *do_last(struct nameidata *nd, return ERR_CAST(dentry); if (file->f_mode & FMODE_OPENED) { - if (file->f_mode & FMODE_CREATED) { - open_flag &= ~O_TRUNC; - will_truncate = false; - acc_mode = 0; - } else if (!S_ISREG(file_inode(file)->i_mode)) - will_truncate = false; - audit_inode(nd->name, file->f_path.dentry, 0); dput(nd->path.dentry); nd->path.dentry = dentry; @@ -3214,10 +3207,6 @@ static const char *do_last(struct nameidata *nd, } if (file->f_mode & FMODE_CREATED) { - /* Don't check for write permission, don't truncate */ - open_flag &= ~O_TRUNC; - will_truncate = false; - acc_mode = 0; dput(nd->path.dentry); nd->path.dentry = dentry; goto finish_open_created; @@ -3252,10 +3241,16 @@ finish_open: } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return ERR_PTR(-ENOTDIR); - if (!d_is_reg(nd->path.dentry)) - will_truncate = false; finish_open_created: + if (file->f_mode & FMODE_CREATED) { + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + will_truncate = false; + acc_mode = 0; + } else if (!d_is_reg(nd->path.dentry)) { + will_truncate = false; + } if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) From 8795e7d48288163ce29eb927240a0c32c40b4d54 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2020 10:38:17 -0500 Subject: [PATCH 38/69] do_last(): simplify the liveness analysis past finish_open_created Don't mess with got_write there - it is guaranteed to be false on entry and it will be set true if and only if we decide to go for truncation and manage to get write access for that. Don't carry acc_mode through the entire thing - it's only used in that part. And don't bother with gotos in there - compiler is quite capable of optimizing that. Signed-off-by: Al Viro --- fs/namei.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index c85fdfd6b33d..8cdf8ef41194 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3122,9 +3122,9 @@ static const char *do_last(struct nameidata *nd, kuid_t dir_uid = nd->inode->i_uid; umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; - bool will_truncate = (open_flag & O_TRUNC) != 0; + bool do_truncate; bool got_write = false; - int acc_mode = op->acc_mode; + int acc_mode; unsigned seq; struct inode *inode; struct dentry *dentry; @@ -3243,36 +3243,30 @@ finish_open: return ERR_PTR(-ENOTDIR); finish_open_created: + do_truncate = false; + acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; - will_truncate = false; acc_mode = 0; - } else if (!d_is_reg(nd->path.dentry)) { - will_truncate = false; - } - if (will_truncate) { + } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return ERR_PTR(error); - got_write = true; + do_truncate = true; } error = may_open(&nd->path, acc_mode, open_flag); - if (error) - goto out; - if (!(file->f_mode & FMODE_OPENED)) + if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); - if (error) - goto out; - error = ima_file_check(file, op->acc_mode); - if (!error && will_truncate) + if (!error) + error = ima_file_check(file, op->acc_mode); + if (!error && do_truncate) error = handle_truncate(file); -out: if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } - if (got_write) + if (do_truncate) mnt_drop_write(nd->path.mnt); return ERR_PTR(error); } From 973d4b73fbaf705f8fa5208234175606df0999e4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2020 10:48:16 -0500 Subject: [PATCH 39/69] do_last(): rejoin the common path even earlier in FMODE_{OPENED,CREATED} case ... getting may_create_in_sticky() checks in FMODE_OPENED case as well. Signed-off-by: Al Viro --- fs/namei.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 8cdf8ef41194..798672577367 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3199,14 +3199,7 @@ static const char *do_last(struct nameidata *nd, if (IS_ERR(dentry)) return ERR_CAST(dentry); - if (file->f_mode & FMODE_OPENED) { - audit_inode(nd->name, file->f_path.dentry, 0); - dput(nd->path.dentry); - nd->path.dentry = dentry; - goto finish_open_created; - } - - if (file->f_mode & FMODE_CREATED) { + if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; goto finish_open_created; @@ -3230,7 +3223,9 @@ finish_open: error = complete_walk(nd); if (error) return ERR_PTR(error); - audit_inode(nd->name, nd->path.dentry, 0); +finish_open_created: + if (!(file->f_mode & FMODE_CREATED)) + audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { if (d_is_dir(nd->path.dentry)) return ERR_PTR(-EISDIR); @@ -3242,7 +3237,6 @@ finish_open: if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return ERR_PTR(-ENOTDIR); -finish_open_created: do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { From c981a4828125e4a5832a726761343cbff8810674 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Jan 2020 11:06:21 -0500 Subject: [PATCH 40/69] split the lookup-related parts of do_last() into a separate helper Signed-off-by: Al Viro --- fs/namei.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 798672577367..6530e3cbd486 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3112,19 +3112,12 @@ out_dput: return ERR_PTR(error); } -/* - * Handle the last step of open() - */ -static const char *do_last(struct nameidata *nd, +static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; - kuid_t dir_uid = nd->inode->i_uid; - umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; - bool do_truncate; bool got_write = false; - int acc_mode; unsigned seq; struct inode *inode; struct dentry *dentry; @@ -3137,9 +3130,9 @@ static const char *do_last(struct nameidata *nd, if (nd->depth) put_link(nd); error = handle_dots(nd, nd->last_type); - if (unlikely(error)) - return ERR_PTR(error); - goto finish_open; + if (likely(!error)) + error = complete_walk(nd); + return ERR_PTR(error); } if (!(open_flag & O_CREAT)) { @@ -3152,7 +3145,6 @@ static const char *do_last(struct nameidata *nd, if (likely(dentry)) goto finish_lookup; - BUG_ON(nd->inode != dir->d_inode); BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ @@ -3162,7 +3154,7 @@ static const char *do_last(struct nameidata *nd, * about to look up */ error = complete_walk(nd); - if (error) + if (unlikely(error)) return ERR_PTR(error); audit_inode(nd->name, dir, AUDIT_INODE_PARENT); @@ -3191,10 +3183,8 @@ static const char *do_last(struct nameidata *nd, else inode_unlock_shared(dir->d_inode); - if (got_write) { + if (got_write) mnt_drop_write(nd->path.mnt); - got_write = false; - } if (IS_ERR(dentry)) return ERR_CAST(dentry); @@ -3202,7 +3192,7 @@ static const char *do_last(struct nameidata *nd, if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; - goto finish_open_created; + return NULL; } finish_lookup: @@ -3218,12 +3208,29 @@ finish_lookup: audit_inode(nd->name, nd->path.dentry, 0); return ERR_PTR(-EEXIST); } -finish_open: + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ - error = complete_walk(nd); - if (error) - return ERR_PTR(error); -finish_open_created: + return ERR_PTR(complete_walk(nd)); +} + +/* + * Handle the last step of open() + */ +static const char *do_last(struct nameidata *nd, + struct file *file, const struct open_flags *op) +{ + kuid_t dir_uid = nd->inode->i_uid; + umode_t dir_mode = nd->inode->i_mode; + int open_flag = op->open_flag; + bool do_truncate; + int acc_mode; + const char *link; + int error; + + link = open_last_lookups(nd, file, op); + if (unlikely(link)) + return link; + if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { From 6b03f7edf43e3b284c023ab03aba6e5e7c5a7559 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Feb 2020 15:53:19 -0500 Subject: [PATCH 41/69] path_connected(): pass mount and dentry separately eventually we'll want to do that check *before* mangling nd->path.dentry... Signed-off-by: Al Viro --- fs/namei.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 6530e3cbd486..2bf9f605c46f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -550,22 +550,20 @@ static int __nd_alloc_stack(struct nameidata *nd) } /** - * path_connected - Verify that a path->dentry is below path->mnt.mnt_root - * @path: nameidate to verify + * path_connected - Verify that a dentry is below mnt.mnt_root * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. */ -static bool path_connected(const struct path *path) +static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { - struct vfsmount *mnt = path->mnt; struct super_block *sb = mnt->mnt_sb; /* Bind mounts and multi-root filesystems can have disconnected paths */ if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; - return is_subdir(path->dentry, mnt->mnt_root); + return is_subdir(dentry, mnt->mnt_root); } static inline int nd_alloc_stack(struct nameidata *nd) @@ -1386,7 +1384,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; nd->path.dentry = parent; nd->seq = seq; - if (unlikely(!path_connected(&nd->path))) + if (unlikely(!path_connected(nd->path.mnt, parent))) return -ECHILD; break; } else { @@ -1448,7 +1446,7 @@ static int path_parent_directory(struct path *path) /* rare case of legitimate dget_parent()... */ path->dentry = dget_parent(path->dentry); dput(old); - if (unlikely(!path_connected(path))) + if (unlikely(!path_connected(path->mnt, path->dentry))) return -ENOENT; return 0; } From 63b27720a476e99d3eaceee9d2246459ae0f6286 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Feb 2020 16:01:19 -0500 Subject: [PATCH 42/69] path_parent_directory(): leave changing path->dentry to callers Instead of returning 0, return new dentry; instead of returning -ENOENT, return NULL. Adjust the callers accordingly. Signed-off-by: Al Viro --- fs/namei.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 2bf9f605c46f..49b2a08105c7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1440,19 +1440,21 @@ static void follow_mount(struct path *path) } } -static int path_parent_directory(struct path *path) +static struct dentry *path_parent_directory(struct path *path) { - struct dentry *old = path->dentry; /* rare case of legitimate dget_parent()... */ - path->dentry = dget_parent(path->dentry); - dput(old); - if (unlikely(!path_connected(path->mnt, path->dentry))) - return -ENOENT; - return 0; + struct dentry *parent = dget_parent(path->dentry); + + if (unlikely(!path_connected(path->mnt, parent))) { + dput(parent); + parent = NULL; + } + return parent; } static int follow_dotdot(struct nameidata *nd) { + struct dentry *parent; while (1) { if (path_equal(&nd->path, &nd->root)) { if (unlikely(nd->flags & LOOKUP_BENEATH)) @@ -1460,9 +1462,11 @@ static int follow_dotdot(struct nameidata *nd) break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { - int ret = path_parent_directory(&nd->path); - if (ret) - return ret; + parent = path_parent_directory(&nd->path); + if (!parent) + return -ENOENT; + dput(nd->path.dentry); + nd->path.dentry = parent; break; } if (!follow_up(&nd->path)) @@ -2600,13 +2604,13 @@ int path_pts(struct path *path) */ struct dentry *child, *parent; struct qstr this; - int ret; - ret = path_parent_directory(path); - if (ret) - return ret; + parent = path_parent_directory(path); + if (!parent) + return -ENOENT; - parent = path->dentry; + dput(path->dentry); + path->dentry = parent; this.name = "pts"; this.len = 3; child = d_hash_and_lookup(parent, &this); From a6a7eb7628cfa7e0e8f62e185502facc43bd2ce3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 11 Mar 2020 13:05:03 -0400 Subject: [PATCH 43/69] expand path_parent_directory() in its callers Signed-off-by: Al Viro --- fs/namei.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 49b2a08105c7..88e5d7920540 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1440,18 +1440,6 @@ static void follow_mount(struct path *path) } } -static struct dentry *path_parent_directory(struct path *path) -{ - /* rare case of legitimate dget_parent()... */ - struct dentry *parent = dget_parent(path->dentry); - - if (unlikely(!path_connected(path->mnt, parent))) { - dput(parent); - parent = NULL; - } - return parent; -} - static int follow_dotdot(struct nameidata *nd) { struct dentry *parent; @@ -1462,9 +1450,13 @@ static int follow_dotdot(struct nameidata *nd) break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { - parent = path_parent_directory(&nd->path); - if (!parent) + /* rare case of legitimate dget_parent()... */ + parent = dget_parent(nd->path.dentry); + + if (unlikely(!path_connected(nd->path.mnt, parent))) { + dput(parent); return -ENOENT; + } dput(nd->path.dentry); nd->path.dentry = parent; break; @@ -2602,13 +2594,14 @@ int path_pts(struct path *path) /* Find something mounted on "pts" in the same directory as * the input path. */ - struct dentry *child, *parent; + struct dentry *parent = dget_parent(path->dentry); + struct dentry *child; struct qstr this; - parent = path_parent_directory(path); - if (!parent) + if (unlikely(!path_connected(path->mnt, parent))) { + dput(parent); return -ENOENT; - + } dput(path->dentry); path->dentry = parent; this.name = "pts"; From abc2c632e0cec6f76883d008c298e31cb3e2d242 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 11:55:02 -0500 Subject: [PATCH 44/69] follow_dotdot{,_rcu}(): lift switching nd->path to parent out of loop Signed-off-by: Al Viro --- fs/namei.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 88e5d7920540..772d82daf3b4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1365,7 +1365,9 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, static int follow_dotdot_rcu(struct nameidata *nd) { + struct dentry *parent = NULL; struct inode *inode = nd->inode; + unsigned seq; while (1) { if (path_equal(&nd->path, &nd->root)) { @@ -1375,15 +1377,12 @@ static int follow_dotdot_rcu(struct nameidata *nd) } if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; - struct dentry *parent = old->d_parent; - unsigned seq; + parent = old->d_parent; inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) return -ECHILD; - nd->path.dentry = parent; - nd->seq = seq; if (unlikely(!path_connected(nd->path.mnt, parent))) return -ECHILD; break; @@ -1406,6 +1405,10 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->seq = seq; } } + if (likely(parent)) { + nd->path.dentry = parent; + nd->seq = seq; + } while (unlikely(d_mountpoint(nd->path.dentry))) { struct mount *mounted; mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); @@ -1442,7 +1445,7 @@ static void follow_mount(struct path *path) static int follow_dotdot(struct nameidata *nd) { - struct dentry *parent; + struct dentry *parent = NULL; while (1) { if (path_equal(&nd->path, &nd->root)) { if (unlikely(nd->flags & LOOKUP_BENEATH)) @@ -1452,13 +1455,10 @@ static int follow_dotdot(struct nameidata *nd) if (nd->path.dentry != nd->path.mnt->mnt_root) { /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); - if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return -ENOENT; } - dput(nd->path.dentry); - nd->path.dentry = parent; break; } if (!follow_up(&nd->path)) @@ -1466,6 +1466,10 @@ static int follow_dotdot(struct nameidata *nd) if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return -EXDEV; } + if (likely(parent)) { + dput(nd->path.dentry); + nd->path.dentry = parent; + } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; return 0; From c9a0f75d81e3ffccb885f0069d45e065acc0349a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 11:59:02 -0500 Subject: [PATCH 45/69] follow_dotdot{,_rcu}(): lift LOOKUP_BENEATH checks out of loop Behaviour change: LOOKUP_BENEATH lookup of .. in absolute root yields an error even if it's not the process' root. That's possible only if you'd managed to escape chroot jail by way of procfs symlinks, but IMO the resulting behaviour is not worse - more consistent and easier to describe: ".." in root is "stay where you are", uness LOOKUP_BENEATH has been given, in which case it's "fail with EXDEV". Signed-off-by: Al Viro --- fs/namei.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 772d82daf3b4..577dc541a4d4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1370,11 +1370,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) unsigned seq; while (1) { - if (path_equal(&nd->path, &nd->root)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -ECHILD; + if (path_equal(&nd->path, &nd->root)) break; - } if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; @@ -1405,7 +1402,10 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->seq = seq; } } - if (likely(parent)) { + if (unlikely(!parent)) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -ECHILD; + } else { nd->path.dentry = parent; nd->seq = seq; } @@ -1447,11 +1447,8 @@ static int follow_dotdot(struct nameidata *nd) { struct dentry *parent = NULL; while (1) { - if (path_equal(&nd->path, &nd->root)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -EXDEV; + if (path_equal(&nd->path, &nd->root)) break; - } if (nd->path.dentry != nd->path.mnt->mnt_root) { /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); @@ -1466,7 +1463,10 @@ static int follow_dotdot(struct nameidata *nd) if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return -EXDEV; } - if (likely(parent)) { + if (unlikely(!parent)) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -EXDEV; + } else { dput(nd->path.dentry); nd->path.dentry = parent; } From 957dd41d88427f1f592d2287b179904087518021 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 01:40:04 -0500 Subject: [PATCH 46/69] move handle_dots(), follow_dotdot() and follow_dotdot_rcu() past step_into() pure move; we are going to have step_into() called by that bunch. Signed-off-by: Al Viro --- fs/namei.c | 260 ++++++++++++++++++++++++++--------------------------- 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 577dc541a4d4..3ee3b8719505 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1363,70 +1363,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return ret; } -static int follow_dotdot_rcu(struct nameidata *nd) -{ - struct dentry *parent = NULL; - struct inode *inode = nd->inode; - unsigned seq; - - while (1) { - if (path_equal(&nd->path, &nd->root)) - break; - if (nd->path.dentry != nd->path.mnt->mnt_root) { - struct dentry *old = nd->path.dentry; - - parent = old->d_parent; - inode = parent->d_inode; - seq = read_seqcount_begin(&parent->d_seq); - if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) - return -ECHILD; - if (unlikely(!path_connected(nd->path.mnt, parent))) - return -ECHILD; - break; - } else { - struct mount *mnt = real_mount(nd->path.mnt); - struct mount *mparent = mnt->mnt_parent; - struct dentry *mountpoint = mnt->mnt_mountpoint; - struct inode *inode2 = mountpoint->d_inode; - unsigned seq = read_seqcount_begin(&mountpoint->d_seq); - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return -ECHILD; - if (&mparent->mnt == nd->path.mnt) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -ECHILD; - /* we know that mountpoint was pinned */ - nd->path.dentry = mountpoint; - nd->path.mnt = &mparent->mnt; - inode = inode2; - nd->seq = seq; - } - } - if (unlikely(!parent)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -ECHILD; - } else { - nd->path.dentry = parent; - nd->seq = seq; - } - while (unlikely(d_mountpoint(nd->path.dentry))) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return -ECHILD; - if (!mounted) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -ECHILD; - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - inode = nd->path.dentry->d_inode; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } - nd->inode = inode; - return 0; -} - /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() */ @@ -1443,38 +1379,6 @@ static void follow_mount(struct path *path) } } -static int follow_dotdot(struct nameidata *nd) -{ - struct dentry *parent = NULL; - while (1) { - if (path_equal(&nd->path, &nd->root)) - break; - if (nd->path.dentry != nd->path.mnt->mnt_root) { - /* rare case of legitimate dget_parent()... */ - parent = dget_parent(nd->path.dentry); - if (unlikely(!path_connected(nd->path.mnt, parent))) { - dput(parent); - return -ENOENT; - } - break; - } - if (!follow_up(&nd->path)) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -EXDEV; - } - if (unlikely(!parent)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -EXDEV; - } else { - dput(nd->path.dentry); - nd->path.dentry = parent; - } - follow_mount(&nd->path); - nd->inode = nd->path.dentry->d_inode; - return 0; -} - /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. @@ -1654,40 +1558,6 @@ static inline int may_lookup(struct nameidata *nd) return inode_permission(nd->inode, MAY_EXEC); } -static inline int handle_dots(struct nameidata *nd, int type) -{ - if (type == LAST_DOTDOT) { - int error = 0; - - if (!nd->root.mnt) { - error = set_root(nd); - if (error) - return error; - } - if (nd->flags & LOOKUP_RCU) - error = follow_dotdot_rcu(nd); - else - error = follow_dotdot(nd); - if (error) - return error; - - if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { - /* - * If there was a racing rename or mount along our - * path, then we can't be sure that ".." hasn't jumped - * above nd->root (and so userspace should retry or use - * some fallback). - */ - smp_rmb(); - if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) - return -EAGAIN; - if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) - return -EAGAIN; - } - } - return 0; -} - enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, @@ -1817,6 +1687,136 @@ static const char *step_into(struct nameidata *nd, int flags, return pick_link(nd, &path, inode, seq, flags); } +static int follow_dotdot_rcu(struct nameidata *nd) +{ + struct dentry *parent = NULL; + struct inode *inode = nd->inode; + unsigned seq; + + while (1) { + if (path_equal(&nd->path, &nd->root)) + break; + if (nd->path.dentry != nd->path.mnt->mnt_root) { + struct dentry *old = nd->path.dentry; + + parent = old->d_parent; + inode = parent->d_inode; + seq = read_seqcount_begin(&parent->d_seq); + if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + return -ECHILD; + if (unlikely(!path_connected(nd->path.mnt, parent))) + return -ECHILD; + break; + } else { + struct mount *mnt = real_mount(nd->path.mnt); + struct mount *mparent = mnt->mnt_parent; + struct dentry *mountpoint = mnt->mnt_mountpoint; + struct inode *inode2 = mountpoint->d_inode; + unsigned seq = read_seqcount_begin(&mountpoint->d_seq); + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return -ECHILD; + if (&mparent->mnt == nd->path.mnt) + break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -ECHILD; + /* we know that mountpoint was pinned */ + nd->path.dentry = mountpoint; + nd->path.mnt = &mparent->mnt; + inode = inode2; + nd->seq = seq; + } + } + if (unlikely(!parent)) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -ECHILD; + } else { + nd->path.dentry = parent; + nd->seq = seq; + } + while (unlikely(d_mountpoint(nd->path.dentry))) { + struct mount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return -ECHILD; + if (!mounted) + break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -ECHILD; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; + inode = nd->path.dentry->d_inode; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + } + nd->inode = inode; + return 0; +} + +static int follow_dotdot(struct nameidata *nd) +{ + struct dentry *parent = NULL; + while (1) { + if (path_equal(&nd->path, &nd->root)) + break; + if (nd->path.dentry != nd->path.mnt->mnt_root) { + /* rare case of legitimate dget_parent()... */ + parent = dget_parent(nd->path.dentry); + if (unlikely(!path_connected(nd->path.mnt, parent))) { + dput(parent); + return -ENOENT; + } + break; + } + if (!follow_up(&nd->path)) + break; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return -EXDEV; + } + if (unlikely(!parent)) { + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return -EXDEV; + } else { + dput(nd->path.dentry); + nd->path.dentry = parent; + } + follow_mount(&nd->path); + nd->inode = nd->path.dentry->d_inode; + return 0; +} + +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + int error = 0; + + if (!nd->root.mnt) { + error = set_root(nd); + if (error) + return error; + } + if (nd->flags & LOOKUP_RCU) + error = follow_dotdot_rcu(nd); + else + error = follow_dotdot(nd); + if (error) + return error; + + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * If there was a racing rename or mount along our + * path, then we can't be sure that ".." hasn't jumped + * above nd->root (and so userspace should retry or use + * some fallback). + */ + smp_rmb(); + if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + return -EAGAIN; + if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + return -EAGAIN; + } + } + return 0; +} + static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; From 7521f22b3ce2fa25e6301f5c7612fe1d698424b4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 12:22:58 -0500 Subject: [PATCH 47/69] handle_dots(), follow_dotdot{,_rcu}(): preparation to switch to step_into() Right now the tail ends of follow_dotdot{,_rcu}() are pretty much the open-coded analogues of step_into(). The differences: * the lack of proper LOOKUP_NO_XDEV handling in non-RCU case (arguably a bug) * the lack of ->d_manage() handling (again, arguably a bug) Adjust the calling conventions so that on the next step with could just switch those functions to returning step_into(). Signed-off-by: Al Viro --- fs/namei.c | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 3ee3b8719505..1749e435edc7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1687,7 +1687,7 @@ static const char *step_into(struct nameidata *nd, int flags, return pick_link(nd, &path, inode, seq, flags); } -static int follow_dotdot_rcu(struct nameidata *nd) +static const char *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent = NULL; struct inode *inode = nd->inode; @@ -1703,9 +1703,9 @@ static int follow_dotdot_rcu(struct nameidata *nd) inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) - return -ECHILD; + return ERR_PTR(-ECHILD); break; } else { struct mount *mnt = real_mount(nd->path.mnt); @@ -1714,11 +1714,11 @@ static int follow_dotdot_rcu(struct nameidata *nd) struct inode *inode2 = mountpoint->d_inode; unsigned seq = read_seqcount_begin(&mountpoint->d_seq); if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); if (&mparent->mnt == nd->path.mnt) break; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -ECHILD; + return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ nd->path.dentry = mountpoint; nd->path.mnt = &mparent->mnt; @@ -1728,7 +1728,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) } if (unlikely(!parent)) { if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -ECHILD; + return ERR_PTR(-ECHILD); } else { nd->path.dentry = parent; nd->seq = seq; @@ -1737,21 +1737,21 @@ static int follow_dotdot_rcu(struct nameidata *nd) struct mount *mounted; mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); if (!mounted) break; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -ECHILD; + return ERR_PTR(-ECHILD); nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } nd->inode = inode; - return 0; + return NULL; } -static int follow_dotdot(struct nameidata *nd) +static const char *follow_dotdot(struct nameidata *nd) { struct dentry *parent = NULL; while (1) { @@ -1762,34 +1762,34 @@ static int follow_dotdot(struct nameidata *nd) parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); - return -ENOENT; + return ERR_PTR(-ENOENT); } break; } if (!follow_up(&nd->path)) break; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -EXDEV; + return ERR_PTR(-EXDEV); } if (unlikely(!parent)) { if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -EXDEV; + return ERR_PTR(-EXDEV); } else { dput(nd->path.dentry); nd->path.dentry = parent; } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; - return 0; + return NULL; } -static inline int handle_dots(struct nameidata *nd, int type) +static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { - int error = 0; + const char *error = NULL; if (!nd->root.mnt) { - error = set_root(nd); + error = ERR_PTR(set_root(nd)); if (error) return error; } @@ -1809,12 +1809,12 @@ static inline int handle_dots(struct nameidata *nd, int type) */ smp_rmb(); if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) - return -EAGAIN; + return ERR_PTR(-EAGAIN); if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) - return -EAGAIN; + return ERR_PTR(-EAGAIN); } } - return 0; + return NULL; } static const char *walk_component(struct nameidata *nd, int flags) @@ -1822,7 +1822,6 @@ static const char *walk_component(struct nameidata *nd, int flags) struct dentry *dentry; struct inode *inode; unsigned seq; - int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and @@ -1831,8 +1830,7 @@ static const char *walk_component(struct nameidata *nd, int flags) if (unlikely(nd->last_type != LAST_NORM)) { if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - err = handle_dots(nd, nd->last_type); - return ERR_PTR(err); + return handle_dots(nd, nd->last_type); } dentry = lookup_fast(nd, &inode, &seq); if (IS_ERR(dentry)) @@ -3128,10 +3126,10 @@ static const char *open_last_lookups(struct nameidata *nd, if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); - error = handle_dots(nd, nd->last_type); - if (likely(!error)) - error = complete_walk(nd); - return ERR_PTR(error); + res = handle_dots(nd, nd->last_type); + if (likely(!res)) + res = ERR_PTR(complete_walk(nd)); + return res; } if (!(open_flag & O_CREAT)) { From 6dfd9fe54dfd615eb1dcb6a1281f3b5fddef190c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 12:33:48 -0500 Subject: [PATCH 48/69] follow_dotdot{,_rcu}(): switch to use of step_into() gets the regular mount crossing on result of .. Signed-off-by: Al Viro --- fs/namei.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1749e435edc7..9c775013368a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1722,33 +1722,18 @@ static const char *follow_dotdot_rcu(struct nameidata *nd) /* we know that mountpoint was pinned */ nd->path.dentry = mountpoint; nd->path.mnt = &mparent->mnt; - inode = inode2; + inode = nd->inode = inode2; nd->seq = seq; } } if (unlikely(!parent)) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); + return step_into(nd, WALK_NOFOLLOW, + nd->path.dentry, nd->inode, nd->seq); } else { - nd->path.dentry = parent; - nd->seq = seq; + return step_into(nd, WALK_NOFOLLOW, parent, inode, seq); } - while (unlikely(d_mountpoint(nd->path.dentry))) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return ERR_PTR(-ECHILD); - if (!mounted) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return ERR_PTR(-ECHILD); - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - inode = nd->path.dentry->d_inode; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } - nd->inode = inode; - return NULL; } static const char *follow_dotdot(struct nameidata *nd) @@ -1774,13 +1759,11 @@ static const char *follow_dotdot(struct nameidata *nd) if (unlikely(!parent)) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); + return step_into(nd, WALK_NOFOLLOW, + dget(nd->path.dentry), nd->inode, nd->seq); } else { - dput(nd->path.dentry); - nd->path.dentry = parent; + return step_into(nd, WALK_NOFOLLOW, parent, parent->d_inode, 0); } - follow_mount(&nd->path); - nd->inode = nd->path.dentry->d_inode; - return NULL; } static const char *handle_dots(struct nameidata *nd, int type) From c2df196876609e00d90909f51b7a3392dc8e377c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 14:33:30 -0500 Subject: [PATCH 49/69] lift all calls of step_into() out of follow_dotdot/follow_dotdot_rcu lift step_into() into handle_dots() (where they merge with each other); have follow_... return dentry and pass inode/seq to the caller. [braino fix folded; kudos to Qian Cai for reporting it] Signed-off-by: Al Viro --- fs/namei.c | 71 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9c775013368a..fda239b9baff 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1687,31 +1687,29 @@ static const char *step_into(struct nameidata *nd, int flags, return pick_link(nd, &path, inode, seq, flags); } -static const char *follow_dotdot_rcu(struct nameidata *nd) +static struct dentry *follow_dotdot_rcu(struct nameidata *nd, + struct inode **inodep, + unsigned *seqp) { - struct dentry *parent = NULL; - struct inode *inode = nd->inode; - unsigned seq; - while (1) { if (path_equal(&nd->path, &nd->root)) break; if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; + struct dentry *parent = old->d_parent; - parent = old->d_parent; - inode = parent->d_inode; - seq = read_seqcount_begin(&parent->d_seq); + *inodep = parent->d_inode; + *seqp = read_seqcount_begin(&parent->d_seq); if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); - break; + return parent; } else { struct mount *mnt = real_mount(nd->path.mnt); struct mount *mparent = mnt->mnt_parent; struct dentry *mountpoint = mnt->mnt_mountpoint; - struct inode *inode2 = mountpoint->d_inode; + struct inode *inode = mountpoint->d_inode; unsigned seq = read_seqcount_begin(&mountpoint->d_seq); if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) return ERR_PTR(-ECHILD); @@ -1722,54 +1720,51 @@ static const char *follow_dotdot_rcu(struct nameidata *nd) /* we know that mountpoint was pinned */ nd->path.dentry = mountpoint; nd->path.mnt = &mparent->mnt; - inode = nd->inode = inode2; + nd->inode = inode; nd->seq = seq; } } - if (unlikely(!parent)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return ERR_PTR(-ECHILD); - return step_into(nd, WALK_NOFOLLOW, - nd->path.dentry, nd->inode, nd->seq); - } else { - return step_into(nd, WALK_NOFOLLOW, parent, inode, seq); - } + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return ERR_PTR(-ECHILD); + return NULL; } -static const char *follow_dotdot(struct nameidata *nd) +static struct dentry *follow_dotdot(struct nameidata *nd, + struct inode **inodep, + unsigned *seqp) { - struct dentry *parent = NULL; while (1) { if (path_equal(&nd->path, &nd->root)) break; if (nd->path.dentry != nd->path.mnt->mnt_root) { /* rare case of legitimate dget_parent()... */ - parent = dget_parent(nd->path.dentry); + struct dentry *parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } - break; + *seqp = 0; + *inodep = parent->d_inode; + return parent; } if (!follow_up(&nd->path)) break; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } - if (unlikely(!parent)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return ERR_PTR(-EXDEV); - return step_into(nd, WALK_NOFOLLOW, - dget(nd->path.dentry), nd->inode, nd->seq); - } else { - return step_into(nd, WALK_NOFOLLOW, parent, parent->d_inode, 0); - } + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return ERR_PTR(-EXDEV); + dget(nd->path.dentry); + return NULL; } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; + struct dentry *parent; + struct inode *inode; + unsigned seq; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); @@ -1777,10 +1772,18 @@ static const char *handle_dots(struct nameidata *nd, int type) return error; } if (nd->flags & LOOKUP_RCU) - error = follow_dotdot_rcu(nd); + parent = follow_dotdot_rcu(nd, &inode, &seq); else - error = follow_dotdot(nd); - if (error) + parent = follow_dotdot(nd, &inode, &seq); + if (IS_ERR(parent)) + return ERR_CAST(parent); + if (unlikely(!parent)) + error = step_into(nd, WALK_NOFOLLOW, + nd->path.dentry, nd->inode, nd->seq); + else + error = step_into(nd, WALK_NOFOLLOW, + parent, inode, seq); + if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { From 12487f3067d8d68a90dd2fc0dbc2a287ae699236 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 14:59:56 -0500 Subject: [PATCH 50/69] follow_dotdot{,_rcu}(): massage loops The logics in both of them is the same: while true if in process' root // uncommon break if *not* in mount root // normal case find the parent return if at absolute root // very uncommon break move to underlying mountpoint report that we are in root Pull the common path out of the loop: if in process' root // uncommon goto in_root if unlikely(in mount root) while true if at absolute root goto in_root move to underlying mountpoint if in process' root goto in_root if in mount root break; find the parent // we are not in mount root return in_root: report that we are in root The reason for that transformation is that we get to keep the common path straight *and* get a separate block for "move through underlying mountpoints", which will allow to sanitize NO_XDEV handling there. What's more, the pared-down loops will be easier to deal with - in particular, non-RCU case has no need to grab mount_lock and rewriting it to the form that wouldn't do that is a non-trivial change. Better do that with less stuff getting in the way... Signed-off-by: Al Viro --- fs/namei.c | 77 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index fda239b9baff..8cfbd5fd0c78 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1691,21 +1691,12 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, struct inode **inodep, unsigned *seqp) { - while (1) { - if (path_equal(&nd->path, &nd->root)) - break; - if (nd->path.dentry != nd->path.mnt->mnt_root) { - struct dentry *old = nd->path.dentry; - struct dentry *parent = old->d_parent; + struct dentry *parent, *old; - *inodep = parent->d_inode; - *seqp = read_seqcount_begin(&parent->d_seq); - if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) - return ERR_PTR(-ECHILD); - if (unlikely(!path_connected(nd->path.mnt, parent))) - return ERR_PTR(-ECHILD); - return parent; - } else { + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + while (1) { struct mount *mnt = real_mount(nd->path.mnt); struct mount *mparent = mnt->mnt_parent; struct dentry *mountpoint = mnt->mnt_mountpoint; @@ -1714,7 +1705,7 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) return ERR_PTR(-ECHILD); if (&mparent->mnt == nd->path.mnt) - break; + goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ @@ -1722,8 +1713,22 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, nd->path.mnt = &mparent->mnt; nd->inode = inode; nd->seq = seq; + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (nd->path.dentry != nd->path.mnt->mnt_root) + break; } } + old = nd->path.dentry; + parent = old->d_parent; + *inodep = parent->d_inode; + *seqp = read_seqcount_begin(&parent->d_seq); + if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + return ERR_PTR(-ECHILD); + if (unlikely(!path_connected(nd->path.mnt, parent))) + return ERR_PTR(-ECHILD); + return parent; +in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); return NULL; @@ -1733,25 +1738,33 @@ static struct dentry *follow_dotdot(struct nameidata *nd, struct inode **inodep, unsigned *seqp) { - while (1) { - if (path_equal(&nd->path, &nd->root)) - break; - if (nd->path.dentry != nd->path.mnt->mnt_root) { - /* rare case of legitimate dget_parent()... */ - struct dentry *parent = dget_parent(nd->path.dentry); - if (unlikely(!path_connected(nd->path.mnt, parent))) { - dput(parent); - return ERR_PTR(-ENOENT); - } - *seqp = 0; - *inodep = parent->d_inode; - return parent; + struct dentry *parent; + + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + while (1) { + if (!follow_up(&nd->path)) + goto in_root; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-EXDEV); + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (nd->path.dentry != nd->path.mnt->mnt_root) + break; } - if (!follow_up(&nd->path)) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return ERR_PTR(-EXDEV); } + /* rare case of legitimate dget_parent()... */ + parent = dget_parent(nd->path.dentry); + if (unlikely(!path_connected(nd->path.mnt, parent))) { + dput(parent); + return ERR_PTR(-ENOENT); + } + *seqp = 0; + *inodep = parent->d_inode; + return parent; + +in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); dget(nd->path.dentry); From efe772d6283b3672c0880b10fc4ceea69bc782ca Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Feb 2020 10:06:37 -0500 Subject: [PATCH 51/69] follow_dotdot_rcu(): be lazy about changing nd->path Change nd->path only after the loop is done and only in case we hadn't ended up finding ourselves in root. Same for NO_XDEV check. Don't recheck mount_lock on each step either. That separates the "check how far back do we need to go through the mount stack" logics from the rest of .. traversal. Note that the sequence for d_seq/d_inode here is * sample mount_lock seqcount ... * sample d_seq * fetch d_inode * verify mount_lock seqcount The last step makes sure that d_inode value we'd got matches d_seq - it dentry is guaranteed to have been a mountpoint through the entire thing, so its d_inode must have been stable. Signed-off-by: Al Viro --- fs/namei.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 8cfbd5fd0c78..af18ede95e2d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1696,28 +1696,31 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + struct path path = nd->path; + unsigned seq; + while (1) { - struct mount *mnt = real_mount(nd->path.mnt); + struct mount *mnt = real_mount(path.mnt); struct mount *mparent = mnt->mnt_parent; struct dentry *mountpoint = mnt->mnt_mountpoint; - struct inode *inode = mountpoint->d_inode; - unsigned seq = read_seqcount_begin(&mountpoint->d_seq); - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return ERR_PTR(-ECHILD); - if (&mparent->mnt == nd->path.mnt) + seq = read_seqcount_begin(&mountpoint->d_seq); + if (&mparent->mnt == path.mnt) goto in_root; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return ERR_PTR(-ECHILD); - /* we know that mountpoint was pinned */ - nd->path.dentry = mountpoint; - nd->path.mnt = &mparent->mnt; - nd->inode = inode; - nd->seq = seq; - if (path_equal(&nd->path, &nd->root)) + path.dentry = mountpoint; + path.mnt = &mparent->mnt; + if (path_equal(&path, &nd->root)) goto in_root; - if (nd->path.dentry != nd->path.mnt->mnt_root) + if (path.dentry != path.mnt->mnt_root) break; } + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-ECHILD); + nd->path = path; + nd->inode = path.dentry->d_inode; + nd->seq = seq; + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return ERR_PTR(-ECHILD); + /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; @@ -1729,6 +1732,8 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, return ERR_PTR(-ECHILD); return parent; in_root: + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); return NULL; From 165200d6cb88a3cc1fdeb12e11e97fb96bf685d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Feb 2020 10:17:52 -0500 Subject: [PATCH 52/69] follow_dotdot(): be lazy about changing nd->path Change nd->path only after the loop is done and only in case we hadn't ended up finding ourselves in root. Same for NO_XDEV check. That separates the "check how far back do we need to go through the mount stack" logics from the rest of .. traversal. NOTE: path_get/path_put introduced here are temporary. They will go away later in the series. Signed-off-by: Al Viro --- fs/namei.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index af18ede95e2d..9cfb7096e307 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1748,16 +1748,24 @@ static struct dentry *follow_dotdot(struct nameidata *nd, if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + struct path path = nd->path; + path_get(&path); while (1) { - if (!follow_up(&nd->path)) + if (!follow_up(&path)) { + path_put(&path); goto in_root; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return ERR_PTR(-EXDEV); - if (path_equal(&nd->path, &nd->root)) + } + if (path_equal(&path, &nd->root)) { + path_put(&path); goto in_root; - if (nd->path.dentry != nd->path.mnt->mnt_root) + } + if (path.dentry != nd->path.mnt->mnt_root) break; } + path_put(&nd->path); + nd->path = path; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); From 7ef482fa65513b18eed05a5c5f00413aad137810 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 17:50:13 -0500 Subject: [PATCH 53/69] helper for mount rootwards traversal The loops in follow_dotdot{_rcu()} are doing the same thing: we have a mount and we want to find out how far up the chain of mounts do we need to go. We follow the chain of mount until we find one that is not directly overmounting the root of another mount. If such a mount is found, we want the location it's mounted upon. If we run out of chain (i.e. get to a mount that is not mounted on anything else) or run into process' root, we report failure. On success, we want (in RCU case) d_seq of resulting location sampled or (in non-RCU case) references to that location acquired. This commit introduces such primitive for RCU case and switches follow_dotdot_rcu() to it; non-RCU case will be go in the next commit. Signed-off-by: Al Viro --- fs/namei.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9cfb7096e307..6529c2506491 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1134,6 +1134,26 @@ int follow_up(struct path *path) } EXPORT_SYMBOL(follow_up); +static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, + struct path *path, unsigned *seqp) +{ + while (mnt_has_parent(m)) { + struct dentry *mountpoint = m->mnt_mountpoint; + + m = m->mnt_parent; + if (unlikely(root->dentry == mountpoint && + root->mnt == &m->mnt)) + break; + if (mountpoint != m->mnt.mnt_root) { + path->mnt = &m->mnt; + path->dentry = mountpoint; + *seqp = read_seqcount_begin(&mountpoint->d_seq); + return true; + } + } + return false; +} + /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we @@ -1696,23 +1716,11 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd, if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { - struct path path = nd->path; + struct path path; unsigned seq; - - while (1) { - struct mount *mnt = real_mount(path.mnt); - struct mount *mparent = mnt->mnt_parent; - struct dentry *mountpoint = mnt->mnt_mountpoint; - seq = read_seqcount_begin(&mountpoint->d_seq); - if (&mparent->mnt == path.mnt) - goto in_root; - path.dentry = mountpoint; - path.mnt = &mparent->mnt; - if (path_equal(&path, &nd->root)) - goto in_root; - if (path.dentry != path.mnt->mnt_root) - break; - } + if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), + &nd->root, &path, &seq)) + goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; From 2aa38470853a65dc9b1b4bd0989d34cd3fc57ebd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 19:19:05 -0500 Subject: [PATCH 54/69] non-RCU analogue of the previous commit new helper: choose_mountpoint(). Wrapper around choose_mountpoint_rcu(), similar to lookup_mnt() vs. __lookup_mnt(). follow_dotdot() switched to it. Now we don't grab mount_lock exclusive anymore; note that the primitive used non-RCU mount traversals in other direction (lookup_mnt()) doesn't bother with that either - it uses mount_lock seqcount instead. Signed-off-by: Al Viro --- fs/namei.c | 56 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 6529c2506491..669a0dfa3012 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -605,10 +605,9 @@ static void terminate_walk(struct nameidata *nd) } /* path_put is needed afterwards regardless of success or failure */ -static bool legitimize_path(struct nameidata *nd, - struct path *path, unsigned seq) +static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { - int res = __legitimize_mnt(path->mnt, nd->m_seq); + int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; @@ -622,6 +621,12 @@ static bool legitimize_path(struct nameidata *nd, return !read_seqcount_retry(&path->dentry->d_seq, seq); } +static inline bool legitimize_path(struct nameidata *nd, + struct path *path, unsigned seq) +{ + return __legitimize_path(path, nd->m_seq, seq); +} + static bool legitimize_links(struct nameidata *nd) { int i; @@ -1154,6 +1159,31 @@ static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, return false; } +static bool choose_mountpoint(struct mount *m, const struct path *root, + struct path *path) +{ + bool found; + + rcu_read_lock(); + while (1) { + unsigned seq, mseq = read_seqbegin(&mount_lock); + + found = choose_mountpoint_rcu(m, root, path, &seq); + if (unlikely(!found)) { + if (!read_seqretry(&mount_lock, mseq)) + break; + } else { + if (likely(__legitimize_path(path, seq, mseq))) + break; + rcu_read_unlock(); + path_put(path); + rcu_read_lock(); + } + } + rcu_read_unlock(); + return found; +} + /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we @@ -1756,22 +1786,14 @@ static struct dentry *follow_dotdot(struct nameidata *nd, if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { - struct path path = nd->path; - path_get(&path); - while (1) { - if (!follow_up(&path)) { - path_put(&path); - goto in_root; - } - if (path_equal(&path, &nd->root)) { - path_put(&path); - goto in_root; - } - if (path.dentry != nd->path.mnt->mnt_root) - break; - } + struct path path; + + if (!choose_mountpoint(real_mount(nd->path.mnt), + &nd->root, &path)) + goto in_root; path_put(&nd->path); nd->path = path; + nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } From 19f6028a01469276abe520c46a2c0ed0204e28d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Feb 2020 20:09:37 -0500 Subject: [PATCH 55/69] fs/namei.c: kill follow_mount() The only remaining caller (path_pts()) should be using follow_down() anyway. And clean path_pts() a bit. Signed-off-by: Al Viro --- fs/namei.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 669a0dfa3012..1f092f9e321b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1413,22 +1413,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return ret; } -/* - * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() - */ -static void follow_mount(struct path *path) -{ - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - } -} - /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. @@ -2640,7 +2624,7 @@ int path_pts(struct path *path) */ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; - struct qstr this; + struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); @@ -2648,15 +2632,13 @@ int path_pts(struct path *path) } dput(path->dentry); path->dentry = parent; - this.name = "pts"; - this.len = 3; child = d_hash_and_lookup(parent, &this); if (!child) return -ENOENT; path->dentry = child; dput(parent); - follow_mount(path); + follow_down(path); return 0; } #endif From 84f0cd9e83daa9a9c3e640f6d108e7e86edf0d86 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 3 Mar 2020 10:14:30 -0500 Subject: [PATCH 56/69] pick_link(): pass it struct path already with normal refcounting rules step_into() tries to avoid grabbing and dropping mount references on the steps that do not involve crossing mountpoints (which is obviously the majority of cases). So it uses a local struct path with unusual refcounting rules - path.mnt is pinned if and only if it's not equal to nd->path.mnt. We used to have similar beasts all over the place and we had quite a few bugs crop up in their handling - it's easy to get confused when changing e.g. cleanup on failure exits (or adding a new check, etc.) Now that's mostly gone - the step_into() instance (which is what we need them for) is the only one left. It is exposed to mount traversal and it's (shortly) seen by pick_link(). Since pick_link() needs to store it in link stack, where the normal rules apply, it has to make sure that mount is pinned regardless of nd->path.mnt value. That's done on all calls of pick_link() and very early in those. Let's do that in the caller (step_into()) instead - that way the fewer places need to be aware of such struct path instances. Signed-off-by: Al Viro --- fs/namei.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1f092f9e321b..aab4eee6d56a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1602,13 +1602,10 @@ static const char *pick_link(struct nameidata *nd, struct path *link, int error; if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { - path_to_nameidata(link, nd); + if (!(nd->flags & LOOKUP_RCU)) + path_put(link); return ERR_PTR(-ELOOP); } - if (!(nd->flags & LOOKUP_RCU)) { - if (link->mnt == nd->path.mnt) - mntget(link->mnt); - } error = nd_alloc_stack(nd); if (unlikely(error)) { if (error == -ECHILD) { @@ -1713,10 +1710,13 @@ static const char *step_into(struct nameidata *nd, int flags, nd->seq = seq; return NULL; } - /* make sure that d_is_symlink above matches inode */ if (nd->flags & LOOKUP_RCU) { + /* make sure that d_is_symlink above matches inode */ if (read_seqcount_retry(&path.dentry->d_seq, seq)) return ERR_PTR(-ECHILD); + } else { + if (path.mnt == nd->path.mnt) + mntget(path.mnt); } return pick_link(nd, &path, inode, seq, flags); } From c99687a03a78775f77d57fe9b07af4c8ec3dd03c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 3 Mar 2020 10:56:17 -0500 Subject: [PATCH 57/69] fold path_to_nameidata() into its only remaining caller Signed-off-by: Al Viro --- fs/namei.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index aab4eee6d56a..0ce515e03e11 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -860,18 +860,6 @@ static int set_root(struct nameidata *nd) return 0; } -static inline void path_to_nameidata(const struct path *path, - struct nameidata *nd) -{ - if (!(nd->flags & LOOKUP_RCU)) { - dput(nd->path.dentry); - if (nd->path.mnt != path->mnt) - mntput(nd->path.mnt); - } - nd->path.mnt = path->mnt; - nd->path.dentry = path->dentry; -} - static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) @@ -1705,7 +1693,12 @@ static const char *step_into(struct nameidata *nd, int flags, ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - path_to_nameidata(&path, nd); + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path.mnt) + mntput(nd->path.mnt); + } + nd->path = path; nd->inode = inode; nd->seq = seq; return NULL; From aef9404d8cc3a5730dea74d05e9cb2e4c2b4a511 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Mar 2020 16:46:14 -0500 Subject: [PATCH 58/69] pick_link(): more straightforward handling of allocation failures pick_link() needs to push onto stack; we start with using two-element array embedded into struct nameidata and the first time we need more than that we switch to separately allocated array. Allocation can fail, of course, and handling of that would be simple enough - we need to drop 'link' and bugger off. However, the things get more complicated in RCU mode. There we must do GFP_ATOMIC allocation. If that fails, we try to switch to non-RCU mode and repeat the allocation. To switch to non-RCU mode we need to grab references to 'link' and to everything in nameidata. The latter done by unlazy_walk(); the former - legitimize_path(). 'link' must go first - after unlazy_walk() we are out of RCU-critical period and it's too late to call legitimize_path() since the references in link->mnt and link->dentry might be pointing to freed and reused memory. So we do legitimize_path(), then unlazy_walk(). And that's where it gets too subtle: what to do if the former fails? We MUST do path_put(link) to avoid leaks. And we can't do that under rcu_read_lock(). Solution in mainline was to empty then nameidata manually, drop out of RCU mode and then do put_path(). In effect, we open-code the things eventual terminate_walk() would've done on error in RCU mode. That looks badly out of place and confusing. We could add a comment along the lines of the explanation above, but... there's a simpler solution. Call unlazy_walk() even if legitimaze_path() fails. It will take us out of RCU mode, so we'll be able to do path_put(link). Yes, it will do unnecessary work - attempt to grab references on the stuff in nameidata, only to have them dropped as soon as we return the error to upper layer and get terminate_walk() called there. So what? We are thoroughly off the fast path by that point - we had GFP_ATOMIC allocation fail, we had ->d_seq or mount_lock mismatch and we are about to try walking the same path from scratch in non-RCU mode. Which will need to do the same allocation, this time with GFP_KERNEL, so it will be able to apply memory pressure for blocking stuff. Compared to that the cost of several lockref_get_not_dead() is noise. And the logics become much easier to understand that way. Signed-off-by: Al Viro --- fs/namei.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 0ce515e03e11..1f3d3981fe02 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1597,14 +1597,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link, error = nd_alloc_stack(nd); if (unlikely(error)) { if (error == -ECHILD) { - if (unlikely(!legitimize_path(nd, link, seq))) { - drop_links(nd); - nd->depth = 0; - nd->flags &= ~LOOKUP_RCU; - nd->path.mnt = NULL; - nd->path.dentry = NULL; - rcu_read_unlock(); - } else if (likely(unlazy_walk(nd)) == 0) + // we must grab link first + bool grabbed_link = legitimize_path(nd, link, seq); + // ... and we must unlazy to be able to clean up + error = unlazy_walk(nd); + if (unlikely(!grabbed_link)) + error = -ECHILD; + if (!error) error = nd_alloc_stack(nd); } if (error) { From 49055906af51b59d50a34b18820f85cc7d085218 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 3 Mar 2020 11:22:34 -0500 Subject: [PATCH 59/69] pick_link(): take reserving space on stack into a new helper Signed-off-by: Al Viro --- fs/namei.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1f3d3981fe02..6025406e01e5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1580,6 +1580,28 @@ static inline int may_lookup(struct nameidata *nd) return inode_permission(nd->inode, MAY_EXEC); } +static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) +{ + int error; + + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) + return -ELOOP; + error = nd_alloc_stack(nd); + if (likely(!error)) + return 0; + if (error == -ECHILD) { + // we must grab link first + bool grabbed_link = legitimize_path(nd, link, seq); + // ... and we must unlazy to be able to clean up + error = unlazy_walk(nd); + if (unlikely(!grabbed_link)) + error = -ECHILD; + if (!error) + error = nd_alloc_stack(nd); + } + return error; +} + enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, @@ -1587,31 +1609,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link, { struct saved *last; const char *res; - int error; + int error = reserve_stack(nd, link, seq); - if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { + if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); - return ERR_PTR(-ELOOP); + return ERR_PTR(error); } - error = nd_alloc_stack(nd); - if (unlikely(error)) { - if (error == -ECHILD) { - // we must grab link first - bool grabbed_link = legitimize_path(nd, link, seq); - // ... and we must unlazy to be able to clean up - error = unlazy_walk(nd); - if (unlikely(!grabbed_link)) - error = -ECHILD; - if (!error) - error = nd_alloc_stack(nd); - } - if (error) { - path_put(link); - return ERR_PTR(error); - } - } - last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); From 4542576b79a680062dc344a8a6e79d34b4c91bf1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 3 Mar 2020 11:25:31 -0500 Subject: [PATCH 60/69] reserve_stack(): switch to __nd_alloc_stack() expand the call of nd_alloc_stack() into it (and don't recheck the depth on the second call) Signed-off-by: Al Viro --- fs/namei.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 6025406e01e5..0539a26dd9b4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -566,15 +566,6 @@ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) return is_subdir(dentry, mnt->mnt_root); } -static inline int nd_alloc_stack(struct nameidata *nd) -{ - if (likely(nd->depth != EMBEDDED_LEVELS)) - return 0; - if (likely(nd->stack != nd->internal)) - return 0; - return __nd_alloc_stack(nd); -} - static void drop_links(struct nameidata *nd) { int i = nd->depth; @@ -1586,7 +1577,13 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; - error = nd_alloc_stack(nd); + + if (likely(nd->depth != EMBEDDED_LEVELS)) + return 0; + if (likely(nd->stack != nd->internal)) + return 0; + + error = __nd_alloc_stack(nd); if (likely(!error)) return 0; if (error == -ECHILD) { @@ -1597,7 +1594,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) if (unlikely(!grabbed_link)) error = -ECHILD; if (!error) - error = nd_alloc_stack(nd); + error = __nd_alloc_stack(nd); } return error; } From 60ef60c7d7291d7cdc43661363cd05a5102e69f6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 3 Mar 2020 11:43:55 -0500 Subject: [PATCH 61/69] __nd_alloc_stack(): make it return bool ... and adjust the caller (reserve_stack()). Rename to nd_alloc_stack(), while we are at it. Signed-off-by: Al Viro --- fs/namei.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 0539a26dd9b4..7a95a3fcbf68 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -529,24 +529,17 @@ static void restore_nameidata(void) kfree(now->stack); } -static int __nd_alloc_stack(struct nameidata *nd) +static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; - if (nd->flags & LOOKUP_RCU) { - p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), - GFP_ATOMIC); - if (unlikely(!p)) - return -ECHILD; - } else { - p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), - GFP_KERNEL); - if (unlikely(!p)) - return -ENOMEM; - } + p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), + nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); + if (unlikely(!p)) + return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; - return 0; + return true; } /** @@ -1573,8 +1566,6 @@ static inline int may_lookup(struct nameidata *nd) static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) { - int error; - if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; @@ -1582,21 +1573,21 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) return 0; if (likely(nd->stack != nd->internal)) return 0; - - error = __nd_alloc_stack(nd); - if (likely(!error)) + if (likely(nd_alloc_stack(nd))) return 0; - if (error == -ECHILD) { - // we must grab link first + + if (nd->flags & LOOKUP_RCU) { + // we need to grab link before we do unlazy. And we can't skip + // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, seq); - // ... and we must unlazy to be able to clean up - error = unlazy_walk(nd); - if (unlikely(!grabbed_link)) - error = -ECHILD; - if (!error) - error = __nd_alloc_stack(nd); + + if (unlazy_walk(nd) != 0 || !grabbed_link) + return -ECHILD; + + if (nd_alloc_stack(nd)) + return 0; } - return error; + return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; From 0f705953012a753068b20016f55dc12d1f7e17e5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 5 Mar 2020 11:34:48 -0500 Subject: [PATCH 62/69] link_path_walk(): sample parent's i_uid and i_mode for the last component Signed-off-by: Al Viro --- fs/namei.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 7a95a3fcbf68..e737d884592d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -505,6 +505,8 @@ struct nameidata { struct nameidata *saved; unsigned root_seq; int dfd; + kuid_t dir_uid; + umode_t dir_mode; } __randomize_layout; static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) @@ -938,9 +940,6 @@ int sysctl_protected_regular __read_mostly; */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - const struct inode *parent; - kuid_t puid; - if (!sysctl_protected_symlinks) return 0; @@ -949,13 +948,11 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->inode; - if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) + if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ - puid = parent->i_uid; - if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) + if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -2159,6 +2156,8 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { + nd->dir_uid = nd->inode->i_uid; + nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } @@ -3224,8 +3223,6 @@ finish_lookup: static const char *do_last(struct nameidata *nd, struct file *file, const struct open_flags *op) { - kuid_t dir_uid = nd->inode->i_uid; - umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; bool do_truncate; int acc_mode; @@ -3241,7 +3238,7 @@ static const char *do_last(struct nameidata *nd, if (open_flag & O_CREAT) { if (d_is_dir(nd->path.dentry)) return ERR_PTR(-EISDIR); - error = may_create_in_sticky(dir_mode, dir_uid, + error = may_create_in_sticky(nd->dir_mode, nd->dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return ERR_PTR(error); From c5971b8c6354a95c9ee7eb722928af5000bac247 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 5 Mar 2020 11:41:29 -0500 Subject: [PATCH 63/69] take post-lookup part of do_last() out of loop now we can have open_last_lookups() directly from the loop in path_openat() - the rest of do_last() never returns a symlink to follow, so we can bloody well leave the loop first. Rename the rest of that thing from do_last() to do_open() and make it return an int. Signed-off-by: Al Viro --- fs/namei.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index e737d884592d..329ade345e62 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3220,31 +3220,26 @@ finish_lookup: /* * Handle the last step of open() */ -static const char *do_last(struct nameidata *nd, +static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { int open_flag = op->open_flag; bool do_truncate; int acc_mode; - const char *link; int error; - link = open_last_lookups(nd, file, op); - if (unlikely(link)) - return link; - if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { if (d_is_dir(nd->path.dentry)) - return ERR_PTR(-EISDIR); + return -EISDIR; error = may_create_in_sticky(nd->dir_mode, nd->dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) - return ERR_PTR(error); + return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) - return ERR_PTR(-ENOTDIR); + return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; @@ -3255,7 +3250,7 @@ static const char *do_last(struct nameidata *nd, } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) - return ERR_PTR(error); + return error; do_truncate = true; } error = may_open(&nd->path, acc_mode, open_flag); @@ -3271,7 +3266,7 @@ static const char *do_last(struct nameidata *nd, } if (do_truncate) mnt_drop_write(nd->path.mnt); - return ERR_PTR(error); + return error; } struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) @@ -3374,8 +3369,10 @@ static struct file *path_openat(struct nameidata *nd, } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && - (s = do_last(nd, file, op)) != NULL) + (s = open_last_lookups(nd, file, op)) != NULL) ; + if (!error) + error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { From f7bb959d9623ddf9fe9cc44234c210051548ea18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 5 Mar 2020 13:25:20 -0500 Subject: [PATCH 64/69] open_last_lookups(): consolidate fsnotify_create() calls Signed-off-by: Al Viro --- fs/namei.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 329ade345e62..6cb41608c47c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2957,8 +2957,6 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, dput(dentry); dentry = dget(file->f_path.dentry); } - if (file->f_mode & FMODE_CREATED) - fsnotify_create(dir, dentry); } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { @@ -2966,8 +2964,6 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, dput(dentry); dentry = file->f_path.dentry; } - if (file->f_mode & FMODE_CREATED) - fsnotify_create(dir, dentry); if (unlikely(d_is_negative(dentry))) error = -ENOENT; } @@ -3103,7 +3099,6 @@ no_open: open_flag & O_EXCL); if (error) goto out_dput; - fsnotify_create(dir_inode, dentry); } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; @@ -3182,6 +3177,8 @@ static const char *open_last_lookups(struct nameidata *nd, else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); + if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) + fsnotify_create(dir->d_inode, dentry); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else From 72287417abd16c42f1d1ea8d93ee60ba08022fc1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Mar 2020 10:09:26 -0400 Subject: [PATCH 65/69] open_last_lookups(): don't abuse complete_walk() when all we want is unlazy Signed-off-by: Al Viro --- fs/namei.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 6cb41608c47c..9f2d95aa2502 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3147,15 +3147,11 @@ static const char *open_last_lookups(struct nameidata *nd, BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ - /* - * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED - * has been cleared when we got to the last component we are - * about to look up - */ - error = complete_walk(nd); - if (unlikely(error)) - return ERR_PTR(error); - + if (nd->flags & LOOKUP_RCU) { + error = unlazy_walk(nd); + if (unlikely(error)) + return ERR_PTR(error); + } audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) From b94e0b32c899914f045ddf05949517d64fd32499 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Mar 2020 10:13:53 -0400 Subject: [PATCH 66/69] open_last_lookups(): lift O_EXCL|O_CREAT handling into do_open() Currently path_openat() has "EEXIST on O_EXCL|O_CREAT" checks done on one of the ways out of open_last_lookups(). There are 4 cases: 1) the last component is . or ..; check is not done. 2) we had FMODE_OPENED or FMODE_CREATED set while in lookup_open(); check is not done. 3) symlink to be traversed is found; check is not done (nor should it be) 4) everything else: check done (before complete_walk(), even). In case (1) O_EXCL|O_CREAT ends up failing with -EISDIR - that's open("/tmp/.", O_CREAT|O_EXCL, 0600) Note that in the same conditions open("/tmp", O_CREAT|O_EXCL, 0600) would have yielded EEXIST. Either error is allowed, switching to -EEXIST in these cases would've been more consistent. Case (2) is more subtle; first of all, if we have FMODE_CREATED set, the object hadn't existed prior to the call. The check should not be done in such a case. The rest is problematic, though - we have FMODE_OPENED set (i.e. it went through ->atomic_open() and got successfully opened there) FMODE_CREATED is *NOT* set O_CREAT and O_EXCL are both set. Any such case is a bug - either we failed to set FMODE_CREATED when we had, in fact, created an object (no such instances in the tree) or we have opened a pre-existing file despite having had both O_CREAT and O_EXCL passed. One of those was, in fact caught (and fixed) while sorting out this mess (gfs2 on cold dcache). And in such situations we should fail with EEXIST. Note that for (1) and (4) FMODE_CREATED is not set - for (1) there's nothing in handle_dots() to set it, for (4) we'd explicitly checked that. And (1), (2) and (4) are exactly the cases when we leave the loop in the caller, with do_open() called immediately after that loop. IOW, we can move the check over there, and make it If we have O_CREAT|O_EXCL and after successful pathname resolution FMODE_CREATED is *not* set, we must have run into a preexisting file and should fail with EEXIST. Signed-off-by: Al Viro --- fs/namei.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 9f2d95aa2502..2e9205599931 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3201,11 +3201,6 @@ finish_lookup: return res; } - if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { - audit_inode(nd->name, nd->path.dentry, 0); - return ERR_PTR(-EEXIST); - } - /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ return ERR_PTR(complete_walk(nd)); } @@ -3224,6 +3219,8 @@ static int do_open(struct nameidata *nd, if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { + if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) + return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(nd->dir_mode, nd->dir_uid, From ff326a3299660e46d4eb8876b217fee38638281a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 10 Mar 2020 10:19:24 -0400 Subject: [PATCH 67/69] open_last_lookups(): move complete_walk() into do_open() Signed-off-by: Al Viro --- fs/namei.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 2e9205599931..2c901a022faf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3128,10 +3128,7 @@ static const char *open_last_lookups(struct nameidata *nd, if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); - res = handle_dots(nd, nd->last_type); - if (likely(!res)) - res = ERR_PTR(complete_walk(nd)); - return res; + return handle_dots(nd, nd->last_type); } if (!(open_flag & O_CREAT)) { @@ -3196,13 +3193,9 @@ finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry, inode, seq); - if (unlikely(res)) { + if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - return res; - } - - /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ - return ERR_PTR(complete_walk(nd)); + return res; } /* @@ -3216,6 +3209,11 @@ static int do_open(struct nameidata *nd, int acc_mode; int error; + if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { + error = complete_walk(nd); + if (error) + return error; + } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { From d489cf9a3e88aafc6c9f55c36ff7bbca2c25d54a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 11 Mar 2020 08:07:53 -0400 Subject: [PATCH 68/69] atomic_open(): no need to pass struct open_flags anymore argument had been unused since 1643b43fbd052 (lookup_open(): lift the "fallback to !O_CREAT" logics from atomic_open()) back in 2016 Signed-off-by: Al Viro --- fs/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 2c901a022faf..1607560f2932 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2933,7 +2933,6 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m */ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, struct file *file, - const struct open_flags *op, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; @@ -3066,7 +3065,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, } if (dir_inode->i_op->atomic_open) { - dentry = atomic_open(nd, dentry, file, op, open_flag, mode); + dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; From 99a4a90c8e9337e364136393286544e3753673c3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 12 Mar 2020 14:07:27 -0400 Subject: [PATCH 69/69] lookup_open(): don't bother with fallbacks to lookup+create We fall back to lookup+create (instead of atomic_open) in several cases: 1) we don't have write access to filesystem and O_TRUNC is present in the flags. It's not something we want ->atomic_open() to see - it just might go ahead and truncate the file. However, we can pass it the flags sans O_TRUNC - eventually do_open() will call handle_truncate() anyway. 2) we have O_CREAT | O_EXCL and we can't write to parent. That's going to be an error, of course, but we want to know _which_ error should that be - might be EEXIST (if file exists), might be EACCES or EROFS. Simply stripping O_CREAT (and checking if we see ENOENT) would suffice, if not for O_EXCL. However, we used to have ->atomic_open() fully responsible for rejecting O_CREAT | O_EXCL on existing file and just stripping O_CREAT would've disarmed those checks. With nothing downstream to catch the problem - FMODE_OPENED used to be "don't bother with EEXIST checks, ->atomic_open() has done those". Now EEXIST checks downstream are skipped only if FMODE_CREATED is set - FMODE_OPENED alone is not enough. That has eliminated the need to fall back onto lookup+create path in this case. 3) O_WRONLY or O_RDWR when we have no write access to filesystem, with nothing else objectionable. Fallback is (and had always been) pointless. IOW, we don't really need that fallback; all we need in such cases is to trim O_TRUNC and O_CREAT properly. Signed-off-by: Al Viro --- fs/namei.c | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1607560f2932..61fdb77a7d58 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2939,9 +2939,6 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, struct inode *dir = nd->path.dentry->d_inode; int error; - if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */ - open_flag &= ~O_TRUNC; - if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; @@ -3038,32 +3035,20 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ + if (unlikely(!got_write)) + open_flag &= ~O_TRUNC; if (open_flag & O_CREAT) { + if (open_flag & O_EXCL) + open_flag &= ~O_TRUNC; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); - if (unlikely(!got_write)) { - create_error = -EROFS; - open_flag &= ~O_CREAT; - if (open_flag & (O_EXCL | O_TRUNC)) - goto no_open; - /* No side effects, safe to clear O_CREAT */ - } else { + if (likely(got_write)) create_error = may_o_create(&nd->path, dentry, mode); - if (create_error) { - open_flag &= ~O_CREAT; - if (open_flag & O_EXCL) - goto no_open; - } - } - } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) && - unlikely(!got_write)) { - /* - * No O_CREATE -> atomicity not a requirement -> fall - * back to lookup + open - */ - goto no_open; + else + create_error = -EROFS; } - + if (create_error) + open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) @@ -3071,7 +3056,6 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, return dentry; } -no_open: if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags);