From 8fd3395ec9051a52828fcca2328cb50a69dea8ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 Jul 2024 11:49:04 -0400 Subject: [PATCH 01/15] get rid of ...lookup...fdget_rcu() family Once upon a time, predecessors of those used to do file lookup without bumping a refcount, provided that caller held rcu_read_lock() across the lookup and whatever it wanted to read from the struct file found. When struct file allocation switched to SLAB_TYPESAFE_BY_RCU, that stopped being feasible and these primitives started to bump the file refcount for lookup result, requiring the caller to call fput() afterwards. But that turned them pointless - e.g. rcu_read_lock(); file = lookup_fdget_rcu(fd); rcu_read_unlock(); is equivalent to file = fget_raw(fd); and all callers of lookup_fdget_rcu() are of that form. Similarly, task_lookup_fdget_rcu() calls can be replaced with calling fget_task(). task_lookup_next_fdget_rcu() doesn't have direct counterparts, but its callers would be happier if we replaced it with an analogue that deals with RCU internally. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/coredump.c | 4 +-- fs/file.c | 28 +++----------------- fs/gfs2/glock.c | 12 ++------- fs/notify/dnotify/dnotify.c | 5 +--- fs/proc/fd.c | 12 +++------ include/linux/fdtable.h | 4 --- include/linux/file.h | 1 + kernel/bpf/task_iter.c | 6 +---- kernel/kcmp.c | 4 +-- 9 files changed, 14 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 18daafbe2e65..301ee7d8b7df 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -73,9 +73,7 @@ static struct spu_context *coredump_next_context(int *fd) return NULL; *fd = n - 1; - rcu_read_lock(); - file = lookup_fdget_rcu(*fd); - rcu_read_unlock(); + file = fget_raw(*fd); if (file) { ctx = SPUFS_I(file_inode(file))->i_ctx; get_spu_context(ctx); diff --git a/fs/file.c b/fs/file.c index eb093e736972..991860ee7848 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1037,29 +1037,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *lookup_fdget_rcu(unsigned int fd) -{ - return __fget_files_rcu(current->files, fd, 0); - -} -EXPORT_SYMBOL_GPL(lookup_fdget_rcu); - -struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) -{ - /* Must be called with rcu_read_lock held */ - struct files_struct *files; - struct file *file = NULL; - - task_lock(task); - files = task->files; - if (files) - file = __fget_files_rcu(files, fd, 0); - task_unlock(task); - - return file; -} - -struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -1069,17 +1047,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int * task_lock(task); files = task->files; if (files) { + rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } + rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fdget_rcu); +EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 269c3bc7fced..4701c4aafbf4 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include "gfs2.h" @@ -2768,25 +2767,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) i->file = NULL; } - rcu_read_lock(); for(;; i->fd++) { - struct inode *inode; - - i->file = task_lookup_next_fdget_rcu(i->task, &i->fd); + i->file = fget_task_next(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } - inode = file_inode(i->file); - if (inode->i_sb == i->sb) + if (file_inode(i->file)->i_sb == i->sb) break; - rcu_read_unlock(); fput(i->file); - rcu_read_lock(); } - rcu_read_unlock(); return i->file; } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d5dbef7f5c95..6004dfdfdf0f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -16,7 +16,6 @@ #include #include #include -#include #include static int dir_notify_enable __read_mostly = 1; @@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) new_fsn_mark = NULL; } - rcu_read_lock(); - f = lookup_fdget_rcu(fd); - rcu_read_unlock(); + f = fget_raw(fd); /* if (f != filp) means that we lost a race and another task/thread * actually closed the fd we are still playing with before we grabbed diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 1f54a54bfb91..18d0dddc8e2f 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) { struct file *file; - rcu_read_lock(); - file = task_lookup_fdget_rcu(task, fd); - rcu_read_unlock(); + file = fget_task(task, fd); if (file) { *mode = file->f_mode; fput(file); @@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) goto out; - rcu_read_lock(); for (fd = ctx->pos - 2;; fd++) { struct file *f; struct fd_data data; char name[10 + 1]; unsigned int len; - f = task_lookup_next_fdget_rcu(p, &fd); + f = fget_task_next(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; - rcu_read_unlock(); fput(f); data.fd = fd; @@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, if (!proc_fill_cache(file, ctx, name, len, instantiate, p, &data)) - goto out; + break; cond_resched(); - rcu_read_lock(); } - rcu_read_unlock(); out: put_task_struct(p); return 0; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index b1c5722f2b3c..e25e2cb65d30 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -92,10 +92,6 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un return files_lookup_fd_raw(files, fd); } -struct file *lookup_fdget_rcu(unsigned int fd); -struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd); -struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd); - static inline bool close_on_exec(unsigned int fd, const struct files_struct *files) { return test_bit(fd, files_fdtable(files)->close_on_exec); diff --git a/include/linux/file.h b/include/linux/file.h index f98de143245a..ec4ad5e6a061 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -72,6 +72,7 @@ static inline void fdput(struct fd fd) extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); +extern struct file *fget_task_next(struct 
task_struct *task, unsigned int *fd); extern void __f_unlock_pos(struct file *); struct fd fdget(unsigned int fd); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 02aa9db8d796..7fe602ca74a0 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -286,17 +285,14 @@ again: curr_fd = 0; } - rcu_read_lock(); - f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); + f = fget_task_next(curr_task, &curr_fd); if (f) { /* set info->fd */ info->fd = curr_fd; - rcu_read_unlock(); return f; } /* the current task is done, go to the next task */ - rcu_read_unlock(); put_task_struct(curr_task); if (info->common.type == BPF_TASK_ITER_TID) { diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b0639f21041f..2c596851f8a9 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -63,9 +63,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) { struct file *file; - rcu_read_lock(); - file = task_lookup_fdget_rcu(task, idx); - rcu_read_unlock(); + file = fget_task(task, idx); if (file) fput(file); From be5498cac2ddb112c5bd7433d5e834a1a2493427 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Jun 2024 23:58:44 -0400 Subject: [PATCH 02/15] remove pointless includes of some of those used to be needed, some had been cargo-culted for no reason... Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/fcntl.c | 1 - fs/file_table.c | 1 - fs/notify/fanotify/fanotify.c | 1 - fs/notify/fanotify/fanotify_user.c | 1 - fs/overlayfs/copy_up.c | 1 - fs/proc/base.c | 1 - io_uring/io_uring.c | 1 - kernel/bpf/bpf_inode_storage.c | 1 - kernel/bpf/bpf_task_storage.c | 1 - kernel/bpf/token.c | 1 - kernel/exit.c | 1 - kernel/module/dups.c | 1 - kernel/module/kmod.c | 1 - kernel/umh.c | 1 - net/handshake/request.c | 1 - security/apparmor/domain.c | 1 - 16 files changed, 16 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 22dd9dcce7ec..8928874c8a2e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/file_table.c b/fs/file_table.c index eed5ffad9997..9e46fd4336b0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 224bccaab4cc..24c7c5df4998 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9644bc72e457..61b83039771e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include #include diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2ed6ad641a20..ee2cbd044ce6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include "overlayfs.h" diff --git a/fs/proc/base.c b/fs/proc/base.c index b31283d81c52..e9d7ddc52f69 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include #include diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b2736e3491b8..5a1676bab998 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -51,7 +51,6 @@ #include #include #include 
-#include #include #include #include diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..e16e79f8cd6d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -16,7 +16,6 @@ #include #include #include -#include #include DEFINE_BPF_STORAGE_CACHE(inode_cache); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..1eb9852a9f8e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -16,7 +16,6 @@ #include #include #include -#include #include DEFINE_BPF_STORAGE_CACHE(task_cache); diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index dcbec1a0dfb3..26057aa13503 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index 619f0014c33b..1dcddfe537ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/module/dups.c b/kernel/module/dups.c index 9a92f2f8c9d3..bd2149fbe117 100644 --- a/kernel/module/dups.c +++ b/kernel/module/dups.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c index 0800d9891692..25f253812512 100644 --- a/kernel/module/kmod.c +++ b/kernel/module/kmod.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/umh.c b/kernel/umh.c index ff1f13a27d29..be9234270777 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/net/handshake/request.c b/net/handshake/request.c index 94d5cef3e048..274d2c89b6b2 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 571158ec6188..2bc34dce9a46 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include From 1fa4ffd8e6f6d001da27f00382af79bad0336091 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 Aug 2024 16:03:19 -0400 Subject: [PATCH 03/15] close_files(): don't bother with xchg() At that point nobody else has references to the victim files_struct; as the matter of fact, the caller will free it immediately after close_files() returns, with no RCU delays or anything of that sort. That's why we are not protecting against fdtable reallocation on expansion, not cleaning the bitmaps, etc. There's no point zeroing the pointers in ->fd[] either, let alone make that an atomic operation. Signed-off-by: Al Viro --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index 991860ee7848..8770010170c5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -413,7 +413,7 @@ static struct fdtable *close_files(struct files_struct * files) set = fdt->open_fds[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&fdt->fd[i], NULL); + struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); From cab0515211f483e392d6862021ed008f49058561 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Jun 2024 17:48:36 -0400 Subject: [PATCH 04/15] move close_range(2) into fs/file.c, fold __close_range() into it We never had callers for __close_range() except for close_range(2) itself. 
Nothing of that sort has appeared in four years and if any users do show up, we can always separate those suckers again. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/file.c | 6 ++++-- fs/open.c | 17 ----------------- include/linux/fdtable.h | 1 - 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/fs/file.c b/fs/file.c index 8770010170c5..8e8f504782bf 100644 --- a/fs/file.c +++ b/fs/file.c @@ -713,7 +713,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, } /** - * __close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close @@ -721,8 +721,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. + * Currently, errors to close a given file descriptor are ignored. */ -int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) +SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, + unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; diff --git a/fs/open.c b/fs/open.c index acaeb3e25c88..62dd1383d6f9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1574,23 +1574,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) return retval; } -/** - * sys_close_range() - Close all file descriptors in a given range. - * - * @fd: starting file descriptor to close - * @max_fd: last file descriptor to close - * @flags: reserved for future extensions - * - * This closes a range of file descriptors. All file descriptors - * from @fd up to and including @max_fd are closed. - * Currently, errors to close a given file descriptor are ignored. - */ -SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, - unsigned int, flags) -{ - return __close_range(fd, max_fd, flags); -} - /* * This routine simulates a hangup on the tty, to arrange that users * are given clean terminals at login time. diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e25e2cb65d30..c45306a9f007 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -111,7 +111,6 @@ int iterate_fd(struct files_struct *, unsigned, const void *); extern int close_fd(unsigned int fd); -extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); extern struct file *file_close_fd(unsigned int fd); extern struct kmem_cache *files_cachep; From 52732bb9abc9ee5b82ed62edef51be4a255fc78a Mon Sep 17 00:00:00 2001 From: Yu Ma Date: Wed, 17 Jul 2024 10:50:16 -0400 Subject: [PATCH 05/15] fs/file.c: remove sanity_check and add likely/unlikely in alloc_fd() alloc_fd() has a sanity check inside to make sure the struct file mapping to the allocated fd is NULL. Remove this sanity check since it can be assured by exisitng zero initilization and NULL set when recycling fd. Meanwhile, add likely/unlikely and expand_file() call avoidance to reduce the work under file_lock. 
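
As a rough sketch of the resulting fast path (simplified from the diff below; locking and the out/repeat labels of alloc_fd() are condensed to plain returns, and fd/end/fdt/error are the local variables of alloc_fd()), expand_files() is now only reached when the chosen descriptor lies beyond the current table:

	/* simplified shape of alloc_fd() after this change, not the exact code */
	if (likely(fd < fdt->max_fds))
		fd = find_next_fd(fdt, fd);

	if (unlikely(fd >= end))
		return -EMFILE;			/* hit the caller-supplied limit */

	if (unlikely(fd >= fdt->max_fds)) {	/* only now take the slow path */
		error = expand_files(files, fd);
		if (error < 0)
			return error;
		if (error)			/* we expanded and may have blocked */
			goto repeat;		/* retry the search from the top */
	}
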
Reviewed-by: Jan Kara Reviewed-by: Tim Chen Signed-off-by: Yu Ma Link: https://lore.kernel.org/r/20240717145018.3972922-2-yu.ma@intel.com Signed-off-by: Christian Brauner Signed-off-by: Al Viro --- fs/file.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/file.c b/fs/file.c index 8e8f504782bf..90b8aa2378cc 100644 --- a/fs/file.c +++ b/fs/file.c @@ -496,7 +496,7 @@ repeat: if (fd < files->next_fd) fd = files->next_fd; - if (fd < fdt->max_fds) + if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* @@ -504,19 +504,21 @@ repeat: * will limit the total number of files that can be opened. */ error = -EMFILE; - if (fd >= end) + if (unlikely(fd >= end)) goto out; - error = expand_files(files, fd); - if (error < 0) - goto out; + if (unlikely(fd >= fdt->max_fds)) { + error = expand_files(files, fd); + if (error < 0) + goto out; - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) - goto repeat; + /* + * If we needed to expand the fs array we + * might have blocked - try again. + */ + if (error) + goto repeat; + } if (start <= files->next_fd) files->next_fd = fd + 1; @@ -527,13 +529,6 @@ repeat: else __clear_close_on_exec(fd, fdt); error = fd; -#if 1 - /* Sanity check */ - if (rcu_access_pointer(fdt->fd[fd]) != NULL) { - printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); - } -#endif out: spin_unlock(&files->file_lock); @@ -599,7 +594,7 @@ void fd_install(unsigned int fd, struct file *file) rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); + WARN_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; From c9a3019603b8a8519f1b6d8ae0059bcb2965f8fe Mon Sep 17 00:00:00 2001 From: Yu Ma Date: Wed, 17 Jul 2024 10:50:17 -0400 Subject: [PATCH 06/15] fs/file.c: conditionally clear full_fds 64 bits in open_fds are mapped to a common bit in full_fds_bits. It is very likely that a bit in full_fds_bits has been cleared before in __clear_open_fds()'s operation. Check the clear bit in full_fds_bits before clearing to avoid unnecessary write and cache bouncing. See commit fc90888d07b8 ("vfs: conditionally clear close-on-exec flag") for a similar optimization. take stock kernel with patch 1 as baseline, it improves pts/blogbench-1.1.0 read for 13%, and write for 5% on Intel ICX 160 cores configuration with v6.10-rc7. 
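
For reference, the resulting helper (matching the diff below) with the reasoning spelled out in comments; each full_fds_bits bit summarizes one BITS_PER_LONG-sized word of open_fds, so it is only ever set while that whole word is full:

	static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
	{
		__clear_bit(fd, fdt->open_fds);
		fd /= BITS_PER_LONG;	/* index of the summary bit for this word */
		/*
		 * The summary bit is set only when all BITS_PER_LONG fds in the
		 * word are open, so it is very likely clear already; testing it
		 * first avoids a needless write and the cache-line bouncing.
		 */
		if (test_bit(fd, fdt->full_fds_bits))
			__clear_bit(fd, fdt->full_fds_bits);
	}
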
Reviewed-by: Jan Kara Reviewed-by: Tim Chen Signed-off-by: Yu Ma Link: https://lore.kernel.org/r/20240717145018.3972922-3-yu.ma@intel.com Signed-off-by: Christian Brauner Signed-off-by: Al Viro --- fs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index 90b8aa2378cc..36c5089812f5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -264,7 +264,9 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); - __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); + fd /= BITS_PER_LONG; + if (test_bit(fd, fdt->full_fds_bits)) + __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) From 0c40bf47cf2d9e1413b1e62826c89c2341e66e40 Mon Sep 17 00:00:00 2001 From: Yu Ma Date: Wed, 17 Jul 2024 10:50:18 -0400 Subject: [PATCH 07/15] fs/file.c: add fast path in find_next_fd() Skip 2-levels searching via find_next_zero_bit() when there is free slot in the word contains next_fd, as: (1) next_fd indicates the lower bound for the first free fd. (2) There is fast path inside of find_next_zero_bit() when size<=64 to speed up searching. (3) After fdt is expanded (the bitmap size doubled for each time of expansion), it would never be shrunk. The search size increases but there are few open fds available here. This fast path is proposed by Mateusz Guzik , and agreed by Jan Kara , which is more generic and scalable than previous versions. And on top of patch 1 and 2, it improves pts/blogbench-1.1.0 read by 8% and write by 4% on Intel ICX 160 cores configuration with v6.10-rc7. Reviewed-by: Jan Kara Reviewed-by: Tim Chen Signed-off-by: Yu Ma Link: https://lore.kernel.org/r/20240717145018.3972922-4-yu.ma@intel.com Signed-off-by: Christian Brauner Signed-off-by: Al Viro --- fs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/file.c b/fs/file.c index 36c5089812f5..236d8bbadb0e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -472,6 +472,15 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; + unsigned int bit; + + /* + * Try to avoid looking at the second level bitmap + */ + bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, + start & (BITS_PER_LONG - 1)); + if (bit < BITS_PER_LONG) + return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) From 1d3b4bec3ce55e0c46cdce7d0402dbd6b4af3a3d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 Aug 2024 22:14:07 -0400 Subject: [PATCH 08/15] alloc_fdtable(): change calling conventions. First of all, tell it how many slots do we want, not which slot is wanted. It makes one caller (dup_fd()) more straightforward and doesn't harm another (expand_fdtable()). Furthermore, make it return ERR_PTR() on failure rather than returning NULL. Simplifies the callers. Simplify the size calculation, while we are at it - note that we always have slots_wanted greater than BITS_PER_LONG. What the rules boil down to is * use the smallest power of two large enough to give us that many slots * on 32bit skip 64 and 128 - the minimal capacity we want there is 256 slots (i.e. 1Kb fd array). * on 64bit don't skip anything, the minimal capacity is 128 - and we'll never be asked for 64 or less. 
128 slots means 1Kb fd array, again. * on 128bit, if that ever happens, don't skip anything - we'll never be asked for 128 or less, so the fd array allocation will be at least 2Kb. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/file.c | 75 +++++++++++++++++++++---------------------------------- 1 file changed, 29 insertions(+), 46 deletions(-) diff --git a/fs/file.c b/fs/file.c index 236d8bbadb0e..7e5e9803a173 100644 --- a/fs/file.c +++ b/fs/file.c @@ -89,18 +89,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". - * - * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied - * by that "1024/sizeof(ptr)" before, we already know there are sufficient - * clear low bits. Clang seems to realize that, gcc ends up being confused. - * - * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, - * let's consider it documentation (and maybe a test-case for gcc to improve - * its code generation ;) */ -static struct fdtable * alloc_fdtable(unsigned int nr) +static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; + unsigned int nr; void *data; /* @@ -108,22 +101,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr) * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B - * and growing in powers of two from there on. + * and growing in powers of two from there on. Since we called only + * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab + * already gives BITS_PER_LONG slots), the above boils down to + * 1. use the smallest power of two large enough to give us that many + * slots. + * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is + * 256 slots (i.e. 1Kb fd array). + * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there + * and we are never going to be asked for 64 or less. */ - nr /= (1024 / sizeof(struct file *)); - nr = roundup_pow_of_two(nr + 1); - nr *= (1024 / sizeof(struct file *)); - nr = ALIGN(nr, BITS_PER_LONG); + if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) + nr = 256; + else + nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open - * had been set lower between the check in expand_files() and here. Deal - * with that in caller, it's cheaper that way. + * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... 
*/ - if (unlikely(nr > sysctl_nr_open)) - nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + if (unlikely(nr > sysctl_nr_open)) { + nr = round_down(sysctl_nr_open, BITS_PER_LONG); + if (nr < slots_wanted) + return ERR_PTR(-EMFILE); + } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) @@ -152,7 +155,7 @@ out_arr: out_fdt: kfree(fdt); out: - return NULL; + return ERR_PTR(-ENOMEM); } /* @@ -169,7 +172,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); - new_fdt = alloc_fdtable(nr); + new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. @@ -178,16 +181,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) synchronize_rcu(); spin_lock(&files->file_lock); - if (!new_fdt) - return -ENOMEM; - /* - * extremely unlikely race - sysctl_nr_open decreased between the check in - * caller and alloc_fdtable(). Cheaper to catch it here... - */ - if (unlikely(new_fdt->max_fds <= nr)) { - __free_fdtable(new_fdt); - return -EMFILE; - } + if (IS_ERR(new_fdt)) + return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); @@ -308,7 +303,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; - int error; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) @@ -340,17 +334,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); - new_fdt = alloc_fdtable(open_files - 1); - if (!new_fdt) { - error = -ENOMEM; - goto out_release; - } - - /* beyond sysctl_nr_open; nothing to do */ - if (unlikely(new_fdt->max_fds < open_files)) { - __free_fdtable(new_fdt); - error = -EMFILE; - goto out_release; + new_fdt = alloc_fdtable(open_files); + if (IS_ERR(new_fdt)) { + kmem_cache_free(files_cachep, newf); + return ERR_CAST(new_fdt); } /* @@ -391,10 +378,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho rcu_assign_pointer(newf->fdt, new_fdt); return newf; - -out_release: - kmem_cache_free(files_cachep, newf); - return ERR_PTR(error); } static struct fdtable *close_files(struct files_struct * files) From e880d33b49e62a76a23d2dcdb32e088a6553d299 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Aug 2024 00:41:24 -0400 Subject: [PATCH 09/15] file.c: merge __{set,clear}_close_on_exec() they are always go in pairs; seeing that they are inlined, might as well make that a single inline function taking a boolean argument ("do we want close_on_exec set for that descriptor") Signed-off-by: Al Viro --- fs/file.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/fs/file.c b/fs/file.c index 7e5e9803a173..d8fccd4796a9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -237,15 +237,15 @@ repeat: return expanded; } -static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) +static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, + bool set) { - __set_bit(fd, fdt->close_on_exec); -} - -static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) -{ - if (test_bit(fd, fdt->close_on_exec)) - __clear_bit(fd, fdt->close_on_exec); + if (set) { + __set_bit(fd, fdt->close_on_exec); + } else { + if (test_bit(fd, 
fdt->close_on_exec)) + __clear_bit(fd, fdt->close_on_exec); + } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) @@ -518,10 +518,7 @@ repeat: files->next_fd = fd + 1; __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_close_on_exec(fd, fdt, flags & O_CLOEXEC); error = fd; out: @@ -1147,13 +1144,8 @@ void __f_unlock_pos(struct file *f) void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; - struct fdtable *fdt; spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (flag) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } @@ -1195,10 +1187,7 @@ __releases(&files->file_lock) get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); - if (flags & O_CLOEXEC) - __set_close_on_exec(fd, fdt); - else - __clear_close_on_exec(fd, fdt); + __set_close_on_exec(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) From 8b1bc2590af61129b82a189e9dc7c2804c34400e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 7 Oct 2024 16:23:57 +0200 Subject: [PATCH 10/15] fs: protect backing files with rcu Currently backing files are not under any form of rcu protection. Switching to file_ref requires rcu protection and so does the speculative vma lookup. Switch backing files to the same rcu slab type as regular files. There should be no additional magic required as the lifetime of a backing file is always tied to a regular file. Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-1-387e24dc9163@kernel.org Signed-off-by: Christian Brauner --- fs/file_table.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index eed5ffad9997..4b23eb7b79dd 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -40,13 +40,17 @@ static struct files_stat_struct files_stat = { /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __ro_after_init; +static struct kmem_cache *bfilp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; /* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path user_path; + union { + struct path user_path; + freeptr_t bf_freeptr; + }; }; static inline struct backing_file *backing_file(struct file *f) @@ -68,7 +72,7 @@ static inline void file_free(struct file *f) put_cred(f->f_cred); if (unlikely(f->f_mode & FMODE_BACKING)) { path_put(backing_file_user_path(f)); - kfree(backing_file(f)); + kmem_cache_free(bfilp_cachep, backing_file(f)); } else { kmem_cache_free(filp_cachep, f); } @@ -267,13 +271,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) struct backing_file *ff; int error; - ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); + ff = kmem_cache_zalloc(bfilp_cachep, GFP_KERNEL); if (unlikely(!ff)) return ERR_PTR(-ENOMEM); error = init_file(&ff->file, flags, cred); if (unlikely(error)) { - kfree(ff); + kmem_cache_free(bfilp_cachep, ff); return ERR_PTR(error); } @@ -529,6 +533,11 @@ void __init files_init(void) filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); + + args.freeptr_offset = offsetof(struct backing_file, bf_freeptr); + bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct 
backing_file), + &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } From b8ea429d7249253ec1fe90dffc648f0668d12385 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 Aug 2024 19:51:39 -0400 Subject: [PATCH 11/15] make __set_open_fd() set cloexec state as well ->close_on_exec[] state is maintained only for opened descriptors; as the result, anything that marks a descriptor opened has to set its cloexec state explicitly. As the result, all calls of __set_open_fd() are followed by __set_close_on_exec(); might as well fold it into __set_open_fd() so that cloexec state is defined as soon as the descriptor is marked opened. [braino fix folded] Signed-off-by: Al Viro --- fs/file.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/file.c b/fs/file.c index d8fccd4796a9..d468a9b6ef4d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -248,9 +248,10 @@ static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, } } -static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) +static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); + __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); @@ -517,8 +518,7 @@ repeat: if (start <= files->next_fd) files->next_fd = fd + 1; - __set_open_fd(fd, fdt); - __set_close_on_exec(fd, fdt, flags & O_CLOEXEC); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; out: @@ -1186,8 +1186,7 @@ __releases(&files->file_lock) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); - __set_open_fd(fd, fdt); - __set_close_on_exec(fd, fdt, flags & O_CLOEXEC); + __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) From 6a8126f077f9d1f33613c9fa3dbd9a6774c6c4dd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Aug 2024 17:38:21 -0400 Subject: [PATCH 12/15] expand_files(): simplify calling conventions All callers treat 0 and 1 returned by expand_files() in the same way now since the call in alloc_fd() had been made conditional. Just make it return 0 on success and be done with it... Signed-off-by: Al Viro --- fs/file.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/file.c b/fs/file.c index d468a9b6ef4d..7251d215048d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -162,7 +162,7 @@ out: * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. - * Return <0 error code on error; 1 on successful completion. + * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) @@ -191,15 +191,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); - return 1; + return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. - * Return <0 error code on error; 0 when nothing done; 1 when files were - * expanded and execution may have blocked. + * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. 
*/ static int expand_files(struct files_struct *files, unsigned int nr) @@ -207,14 +206,14 @@ static int expand_files(struct files_struct *files, unsigned int nr) __acquires(files->file_lock) { struct fdtable *fdt; - int expanded = 0; + int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return expanded; + return 0; /* Can we expand? */ if (nr >= sysctl_nr_open) @@ -222,7 +221,6 @@ repeat: if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); - expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; @@ -230,11 +228,11 @@ repeat: /* All good, so we try */ files->resize_in_progress = true; - expanded = expand_fdtable(files, nr); + error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); - return expanded; + return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, @@ -507,12 +505,7 @@ repeat: if (error < 0) goto out; - /* - * If we needed to expand the fs array we - * might have blocked - try again. - */ - if (error) - goto repeat; + goto repeat; } if (start <= files->next_fd) From 08ef26ea9ab315b895d57f8fbad41e02ff345bb9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 7 Oct 2024 16:23:58 +0200 Subject: [PATCH 13/15] fs: add file_ref As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop it has O(N^2) behaviour under contention with N concurrent operations and it is in a hot path in __fget_files_rcu(). The rcuref infrastructures remedies this problem by using an unconditional increment relying on safe- and dead zones to make this work and requiring rcu protection for the data structure in question. This not just scales better it also introduces overflow protection. However, in contrast to generic rcuref, files require a memory barrier and thus cannot rely on *_relaxed() atomic operations and also require to be built on atomic_long_t as having massive amounts of reference isn't unheard of even if it is just an attack. As suggested by Linus, add a file specific variant instead of making this a generic library. Files are SLAB_TYPESAFE_BY_RCU and thus don't have "regular" rcu protection. In short, freeing of files isn't delayed until a grace period has elapsed. Instead, they are freed immediately and thus can be reused (multiple times) within the same grace period. So when picking a file from the file descriptor table via its file descriptor number it is thus possible to see an elevated reference count on file->f_count even though the file has already been recycled possibly multiple times by another task. To guard against this the vfs will pick the file from the file descriptor table twice. Once before the refcount increment and once after to compare the pointers (grossly simplified). If they match then the file is still valid. If not the caller needs to fput() it. 
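
A grossly simplified sketch of that double lookup (the real logic lives in __get_file_rcu()/__fget_files_rcu() in fs/file.c; "fdentry" below is just a placeholder for the __rcu file pointer taken from the descriptor table):

	/* caller holds rcu_read_lock(); fdentry points into the fd table */
	struct file *file = rcu_dereference_raw(*fdentry);

	if (!file)
		return NULL;
	if (unlikely(!file_ref_get(&file->f_ref)))
		/* count was already dead; the file may have been recycled */
		return ERR_PTR(-EAGAIN);
	if (unlikely(file != rcu_dereference_raw(*fdentry))) {
		/* the slot changed under us: we pinned a recycled file, drop it */
		fput(file);
		return ERR_PTR(-EAGAIN);
	}
	return file;	/* both lookups agree: the reference is valid */
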
The unconditional increment makes the following race possible as illustrated by rcuref: > Deconstruction race > =================== > > The release operation must be protected by prohibiting a grace period in > order to prevent a possible use after free: > > T1 T2 > put() get() > // ref->refcnt = ONEREF > if (!atomic_add_negative(-1, &ref->refcnt)) > return false; <- Not taken > > // ref->refcnt == NOREF > --> preemption > // Elevates ref->refcnt to ONEREF > if (!atomic_add_negative(1, &ref->refcnt)) > return true; <- taken > > if (put(&p->ref)) { <-- Succeeds > remove_pointer(p); > kfree_rcu(p, rcu); > } > > RCU grace period ends, object is freed > > atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF > > [...] it prevents the grace period which keeps the object alive until > all put() operations complete. Having files by SLAB_TYPESAFE_BY_RCU shouldn't cause any problems for this deconstruction race. Afaict, the only interesting case would be someone freeing the file and someone immediately recycling it within the same grace period and reinitializing file->f_count to ONEREF while a concurrent fput() is doing atomic_cmpxchg(&ref->refcnt, NOREF, DEAD) as in the race above. But this is safe from SLAB_TYPESAFE_BY_RCU's perspective and it should be safe from rcuref's perspective. T1 T2 T3 fput() fget() // f_count->refcnt = ONEREF if (!atomic_add_negative(-1, &f_count->refcnt)) return false; <- Not taken // f_count->refcnt == NOREF --> preemption // Elevates f_count->refcnt to ONEREF if (!atomic_add_negative(1, &f_count->refcnt)) return true; <- taken if (put(&f_count)) { <-- Succeeds remove_pointer(p); /* * Cache is SLAB_TYPESAFE_BY_RCU * so this is freed without a grace period. */ kmem_cache_free(p); } kmem_cache_alloc() init_file() { // Sets f_count->refcnt to ONEREF rcuref_long_init(&f->f_count, 1); } Object has been reused within the same grace period via kmem_cache_alloc()'s SLAB_TYPESAFE_BY_RCU. /* * With SLAB_TYPESAFE_BY_RCU this would be a safe UAF access and * it would work correctly because the atomic_cmpxchg() * will fail because the refcount has been reset to ONEREF by T3. */ atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF However, there are other cases to consider: (1) Benign race due to multiple atomic_long_read() CPU1 CPU2 file_ref_put() // last reference // => count goes negative/FILE_REF_NOREF atomic_long_add_negative_release(-1, &ref->refcnt) -> __file_ref_put() file_ref_get() // goes back from negative/FILE_REF_NOREF to 0 // and file_ref_get() succeeds atomic_long_add_negative(1, &ref->refcnt) // This is immediately followed by file_ref_put() // managing to set FILE_REF_DEAD file_ref_put() // __file_ref_put() continues and sees // cnt > FILE_REF_RELEASED // and splats with // "imbalanced put on file reference count" cnt = atomic_long_read(&ref->refcnt); The race however is benign and the problem is the atomic_long_read(). Instead of performing a separate read this uses atomic_long_dec_return() and pass the value to __file_ref_put(). Thanks to Linus for pointing out that braino. (2) SLAB_TYPESAFE_BY_RCU may cause recycled files to be marked dead When a file is recycled the following race exists: CPU1 CPU2 // @file is already dead and thus // cnt >= FILE_REF_RELEASED. 
file_ref_get(file) atomic_long_add_negative(1, &ref->refcnt) // We thus call into __file_ref_get() -> __file_ref_get() // which sees cnt >= FILE_REF_RELEASED cnt = atomic_long_read(&ref->refcnt); // In the meantime @file gets freed kmem_cache_free() // and is immediately recycled file = kmem_cache_zalloc() // and the reference count is reinitialized // and the file alive again in someone // else's file descriptor table file_ref_init(&ref->refcnt, 1); // the __file_ref_get() slowpath now continues // and as it saw earlier that cnt >= FILE_REF_RELEASED // it wants to ensure that we're staying in the middle // of the deadzone and unconditionally sets // FILE_REF_DEAD. // This marks @file dead for CPU2... atomic_long_set(&ref->refcnt, FILE_REF_DEAD); // Caller issues a close() system call to close @file close(fd) file = file_close_fd_locked() filp_flush() // The caller sees that cnt >= FILE_REF_RELEASED // and warns the first time... CHECK_DATA_CORRUPTION(file_count(file) == 0) // and then splats a second time because // __file_ref_put() sees cnt >= FILE_REF_RELEASED file_ref_put(&ref->refcnt); -> __file_ref_put() My initial inclination was to replace the unconditional atomic_long_set() with an atomic_long_try_cmpxchg() but Linus pointed out that: > I think we should just make file_ref_get() do a simple > > return !atomic_long_add_negative(1, &ref->refcnt)); > > and nothing else. Yes, multiple CPU's can race, and you can increment > more than once, but the gap - even on 32-bit - between DEAD and > becoming close to REF_RELEASED is so big that we simply don't care. > That's the point of having a gap. I've been testing this with will-it-scale using fstat() on a machine that Jens gave me access (thank you very much!): processor : 511 vendor_id : AuthenticAMD cpu family : 25 model : 160 model name : AMD EPYC 9754 128-Core Processor and I consistently get a 3-5% improvement on 256+ threads. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202410151043.5d224a27-oliver.sang@intel.com Closes: https://lore.kernel.org/all/202410151611.f4cd71f2-oliver.sang@intel.com Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-2-387e24dc9163@kernel.org Signed-off-by: Christian Brauner --- fs/file.c | 63 ++++++++++++++ include/linux/file_ref.h | 177 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 include/linux/file_ref.h diff --git a/fs/file.c b/fs/file.c index eb093e736972..ab15e2a3d280 100644 --- a/fs/file.c +++ b/fs/file.c @@ -20,10 +20,73 @@ #include #include #include +#include #include #include "internal.h" +/** + * __file_ref_put - Slowpath of file_ref_put() + * @ref: Pointer to the reference count + * @cnt: Current reference count + * + * Invoked when the reference count is outside of the valid zone. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * deconstruct the protected object. + */ +bool __file_ref_put(file_ref_t *ref, unsigned long cnt) +{ + /* Did this drop the last reference? */ + if (likely(cnt == FILE_REF_NOREF)) { + /* + * Carefully try to set the reference count to FILE_REF_DEAD. 
+ * + * This can fail if a concurrent get() operation has + * elevated it again or the corresponding put() even marked + * it dead already. Both are valid situations and do not + * require a retry. If this fails the caller is not + * allowed to deconstruct the object. + */ + if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) + return false; + + /* + * The caller can safely schedule the object for + * deconstruction. Provide acquire ordering. + */ + smp_acquire__after_ctrl_dep(); + return true; + } + + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. + */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} +EXPORT_SYMBOL_GPL(__file_ref_put); + unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h new file mode 100644 index 000000000000..9b3a8d9b17ab --- /dev/null +++ b/include/linux/file_ref.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_FILE_REF_H +#define _LINUX_FILE_REF_H + +#include +#include +#include + +/* + * file_ref is a reference count implementation specifically for use by + * files. It takes inspiration from rcuref but differs in key aspects + * such as support for SLAB_TYPESAFE_BY_RCU type caches. + * + * FILE_REF_ONEREF FILE_REF_MAXREF + * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL + * <-------------------valid -------------------> + * + * FILE_REF_SATURATED + * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL + * <-----------------------saturation zone----------------------> + * + * FILE_REF_RELEASED FILE_REF_DEAD + * 0xC000000000000000UL 0xE000000000000000UL + * <-------------------dead zone-------------------> + * + * FILE_REF_NOREF + * 0xFFFFFFFFFFFFFFFFUL + */ + +#ifdef CONFIG_64BIT +#define FILE_REF_ONEREF 0x0000000000000000UL +#define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL +#define FILE_REF_SATURATED 0xA000000000000000UL +#define FILE_REF_RELEASED 0xC000000000000000UL +#define FILE_REF_DEAD 0xE000000000000000UL +#define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL +#else +#define FILE_REF_ONEREF 0x00000000U +#define FILE_REF_MAXREF 0x7FFFFFFFU +#define FILE_REF_SATURATED 0xA0000000U +#define FILE_REF_RELEASED 0xC0000000U +#define FILE_REF_DEAD 0xE0000000U +#define FILE_REF_NOREF 0xFFFFFFFFU +#endif + +typedef struct { +#ifdef CONFIG_64BIT + atomic64_t refcnt; +#else + atomic_t refcnt; +#endif +} file_ref_t; + +/** + * file_ref_init - Initialize a file reference count + * @ref: Pointer to the reference count + * @cnt: The initial reference count typically '1' + */ +static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) +{ + atomic_long_set(&ref->refcnt, cnt - 1); +} + +bool __file_ref_put(file_ref_t *ref, unsigned long cnt); + +/** + * file_ref_get - Acquire one reference on a file + * @ref: Pointer to the reference count + * + * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF. 
+ * + * Provides full memory ordering. + * + * Return: False if the attempt to acquire a reference failed. This happens + * when the last reference has been put already. True if a reference + * was successfully acquired + */ +static __always_inline __must_check bool file_ref_get(file_ref_t *ref) +{ + /* + * Unconditionally increase the reference count with full + * ordering. The saturation and dead zones provide enough + * tolerance for this. + * + * If this indicates negative the file in question the fail can + * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU. + * Hence, unconditionally altering the file reference count to + * e.g., reset the file reference count back to the middle of + * the deadzone risk end up marking someone else's file as dead + * behind their back. + * + * It would be possible to do a careful: + * + * cnt = atomic_long_inc_return(); + * if (likely(cnt >= 0)) + * return true; + * + * and then something like: + * + * if (cnt >= FILE_REF_RELEASE) + * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), + * + * to set the value back to the middle of the deadzone. But it's + * practically impossible to go from FILE_REF_DEAD to + * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 + * file_ref_get()s to resurrect such a dead file. + */ + return !atomic_long_add_negative(1, &ref->refcnt); +} + +/** + * file_ref_inc - Acquire one reference on a file + * @ref: Pointer to the reference count + * + * Acquire an additional reference on a file. Warns if the caller didn't + * already hold a reference. + */ +static __always_inline void file_ref_inc(file_ref_t *ref) +{ + long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); + WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); +} + +/** + * file_ref_put -- Release a file reference + * @ref: Pointer to the reference count + * + * Provides release memory ordering, such that prior loads and stores + * are done before, and provides an acquire ordering on success such + * that free() must come after. + * + * Return: True if this was the last reference with no future references + * possible. This signals the caller that it can safely release + * the object which is protected by the reference counter. + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * release the protected object. + */ +static __always_inline __must_check bool file_ref_put(file_ref_t *ref) +{ + long cnt; + + /* + * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() + * calls don't risk UAFs when a file is recyclyed, it is still + * vulnerable to UAFs caused by freeing the whole slab page once + * it becomes unused. Prevent file_ref_put() from being + * preempted protects against this. + */ + guard(preempt)(); + /* + * Unconditionally decrease the reference count. The saturation + * and dead zones provide enough tolerance for this. If this + * fails then we need to handle the last reference drop and + * cases inside the saturation and dead zones. + */ + cnt = atomic_long_dec_return(&ref->refcnt); + if (cnt >= 0) + return false; + return __file_ref_put(ref, cnt); +} + +/** + * file_ref_read - Read the number of file references + * @ref: Pointer to the reference count + * + * Return: The number of held references (0 ... N) + */ +static inline unsigned long file_ref_read(file_ref_t *ref) +{ + unsigned long c = atomic_long_read(&ref->refcnt); + + /* Return 0 if within the DEAD zone. */ + return c >= FILE_REF_RELEASED ? 
0 : c + 1; +} + +#endif From 90ee6ed776c06435a3fe79c7f5344761f52e1760 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 7 Oct 2024 16:23:59 +0200 Subject: [PATCH 14/15] fs: port files to file_ref Port files to rely on file_ref reference to improve scaling and gain overflow protection. - We continue to WARN during get_file() in case a file that is already marked dead is revived as get_file() is only valid if the caller already holds a reference to the file. This hasn't changed just the check changes. - The semantics for epoll and ttm's dmabuf usage have changed. Both epoll and ttm synchronize with __fput() to prevent the underlying file from beeing freed. (1) epoll Explaining epoll is straightforward using a simple diagram. Essentially, the mutex of the epoll instance needs to be taken in both __fput() and around epi_fget() preventing the file from being freed while it is polled or preventing the file from being resurrected. CPU1 CPU2 fput(file) -> __fput(file) -> eventpoll_release(file) -> eventpoll_release_file(file) mutex_lock(&ep->mtx) epi_item_poll() -> epi_fget() -> file_ref_get(file) mutex_unlock(&ep->mtx) mutex_lock(&ep->mtx); __ep_remove() mutex_unlock(&ep->mtx); -> kmem_cache_free(file) (2) ttm dmabuf This explanation is a bit more involved. A regular dmabuf file stashed the dmabuf in file->private_data and the file in dmabuf->file: file->private_data = dmabuf; dmabuf->file = file; The generic release method of a dmabuf file handles file specific things: f_op->release::dma_buf_file_release() while the generic dentry release method of a dmabuf handles dmabuf freeing including driver specific things: dentry->d_release::dma_buf_release() During ttm dmabuf initialization in ttm_object_device_init() the ttm driver copies the provided struct dma_buf_ops into a private location: struct ttm_object_device { spinlock_t object_lock; struct dma_buf_ops ops; void (*dmabuf_release)(struct dma_buf *dma_buf); struct idr idr; }; ttm_object_device_init(const struct dma_buf_ops *ops) { // copy original dma_buf_ops in private location tdev->ops = *ops; // stash the release method of the original struct dma_buf_ops tdev->dmabuf_release = tdev->ops.release; // override the release method in the copy of the struct dma_buf_ops // with ttm's own dmabuf release method tdev->ops.release = ttm_prime_dmabuf_release; } When a new dmabuf is created the struct dma_buf_ops with the overriden release method set to ttm_prime_dmabuf_release is passed in exp_info.ops: DEFINE_DMA_BUF_EXPORT_INFO(exp_info); exp_info.ops = &tdev->ops; exp_info.size = prime->size; exp_info.flags = flags; exp_info.priv = prime; The call to dma_buf_export() then sets mutex_lock_interruptible(&prime->mutex); dma_buf = dma_buf_export(&exp_info) { dmabuf->ops = exp_info->ops; } mutex_unlock(&prime->mutex); which creates a new dmabuf file and then install a file descriptor to it in the callers file descriptor table: ret = dma_buf_fd(dma_buf, flags); When that dmabuf file is closed we now get: fput(file) -> __fput(file) -> f_op->release::dma_buf_file_release() -> dput() -> d_op->d_release::dma_buf_release() -> dmabuf->ops->release::ttm_prime_dmabuf_release() mutex_lock(&prime->mutex); if (prime->dma_buf == dma_buf) prime->dma_buf = NULL; mutex_unlock(&prime->mutex); Where we can see that prime->dma_buf is set to NULL. 
So when we have the following diagram: CPU1 CPU2 fput(file) -> __fput(file) -> f_op->release::dma_buf_file_release() -> dput() -> d_op->d_release::dma_buf_release() -> dmabuf->ops->release::ttm_prime_dmabuf_release() ttm_prime_handle_to_fd() mutex_lock_interruptible(&prime->mutex) dma_buf = prime->dma_buf dma_buf && get_dma_buf_unless_doomed(dma_buf) -> file_ref_get(dma_buf->file) mutex_unlock(&prime->mutex); mutex_lock(&prime->mutex); if (prime->dma_buf == dma_buf) prime->dma_buf = NULL; mutex_unlock(&prime->mutex); -> kmem_cache_free(file) The logic of the mechanism is the same as for epoll: sync with __fput() preventing the file from being freed. Here the synchronization happens through the ttm instance's prime->mutex. Basically, the lifetime of the dma_buf and the file are tighly coupled. Both (1) and (2) used to call atomic_inc_not_zero() to check whether the file has already been marked dead and then refuse to revive it. This is only safe because both (1) and (2) sync with __fput() and thus prevent kmem_cache_free() on the file being called and thus prevent the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU. Both (1) and (2) have been ported from atomic_inc_not_zero() to file_ref_get(). That means a file that is already in the process of being marked as FILE_REF_DEAD: file_ref_put() cnt = atomic_long_dec_return() -> __file_ref_put(cnt) if (cnt == FIlE_REF_NOREF) atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD) can be revived again: CPU1 CPU2 file_ref_put() cnt = atomic_long_dec_return() -> __file_ref_put(cnt) if (cnt == FIlE_REF_NOREF) file_ref_get() // Brings reference back to FILE_REF_ONEREF atomic_long_add_negative() atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD) This is fine and inherent to the file_ref_get()/file_ref_put() semantics. For both (1) and (2) this is safe because __fput() is prevented from making progress if file_ref_get() fails due to the aforementioned synchronization mechanisms. Two cases need to be considered that affect both (1) epoll and (2) ttm dmabuf: (i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but before that fput() can mark the file as FILE_REF_DEAD someone manages to sneak in a file_ref_get() and brings the refcount back from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original fput() doesn't call __fput(). For epoll the poll will finish and for ttm dmabuf the file can be used again. For ttm dambuf this is actually an advantage because it avoids immediately allocating a new dmabuf object. CPU1 CPU2 file_ref_put() cnt = atomic_long_dec_return() -> __file_ref_put(cnt) if (cnt == FIlE_REF_NOREF) file_ref_get() // Brings reference back to FILE_REF_ONEREF atomic_long_add_negative() atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD) (ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and also suceeds in actually marking it FILE_REF_DEAD and then calls into __fput() to free the file. When either (1) or (2) call file_ref_get() they fail as atomic_long_add_negative() will return true. At the same time, both (1) and (2) all file_ref_get() under mutexes that __fput() must also acquire preventing kmem_cache_free() from freeing the file. So while this might be treated as a change in semantics for (1) and (2) it really isn't. 
  If it should end up causing issues, this can be fixed by adding a
  helper that does something like:

      long cnt = atomic_long_read(&ref->refcnt);
      do {
              if (cnt < 0)
                      return false;
      } while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
      return true;

  which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.

- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
  anymore once files have been ported to file_ref_t.

  The kmem_cache_zalloc() call will memset() the whole struct file to
  zero when it is reallocated. This will also set file->f_ref to zero,
  which means that a concurrent file_ref_get() can return true:

      CPU1                                    CPU2
      __get_file_rcu()
        rcu_dereference_raw()
                                              close()
                                              [frees file]
                                              alloc_empty_file()
                                              kmem_cache_zalloc()
                                              [reallocates same file]
                                              memset(..., 0, ...)
        file_ref_get()
        [increments 0->1, returns true]
                                              init_file()
                                              file_ref_init(..., 1)
                                              [sets to 0]
        rcu_dereference_raw()
        fput()
        file_ref_put()
        [decrements 0->FILE_REF_NOREF, frees file]
                                              [UAF]

  causing a concurrent __get_file_rcu() call to acquire a reference to
  the file that is about to be reallocated and to immediately free it
  on realizing that it has been recycled. This causes a UAF for the
  task that reallocated/recycled the file.

  This is prevented by switching from kmem_cache_zalloc() to
  kmem_cache_alloc() and initializing the fields manually, with
  file->f_ref initialized last.

  Note that a memset() also isn't guaranteed to atomically update an
  unsigned long, so it's theoretically possible to see torn and
  therefore bogus counter values.

Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner
---
 drivers/gpu/drm/i915/gt/shmem_utils.c |  2 +-
 drivers/gpu/drm/vmwgfx/ttm_object.c   |  2 +-
 fs/eventpoll.c                        |  2 +-
 fs/file.c                             | 14 +++++------
 fs/file_table.c                       | 34 ++++++++++++++++++++-------
 include/linux/fs.h                    | 10 ++++----
 6 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index 1fb6ff77fd89..bb696b29ee2c 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -40,7 +40,7 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
 
 	if (i915_gem_object_is_shmem(obj)) {
 		file = obj->base.filp;
-		atomic_long_inc(&file->f_count);
+		get_file(file);
 		return file;
 	}
 
diff --git a/drivers/gpu/drm/vmwgfx/ttm_object.c b/drivers/gpu/drm/vmwgfx/ttm_object.c
index 3353e97687d1..a17e62867f3b 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_object.c
+++ b/drivers/gpu/drm/vmwgfx/ttm_object.c
@@ -471,7 +471,7 @@ void ttm_object_device_release(struct ttm_object_device **p_tdev)
  */
 static bool __must_check get_dma_buf_unless_doomed(struct dma_buf *dmabuf)
 {
-	return atomic_long_inc_not_zero(&dmabuf->file->f_count) != 0L;
+	return file_ref_get(&dmabuf->file->f_ref);
 }
 
 /**
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542f0bd8..212383cefe6c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1002,7 +1002,7 @@ static struct file *epi_fget(const struct epitem *epi)
 	struct file *file;
 
 	file = epi->ffd.file;
-	if (!atomic_long_inc_not_zero(&file->f_count))
+	if (!file_ref_get(&file->f_ref))
 		file = NULL;
 	return file;
 }
diff --git a/fs/file.c b/fs/file.c
index ab15e2a3d280..9598c577f713 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -902,7 +902,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
 	if (!file)
 		return NULL;
 
-	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+	if (unlikely(!file_ref_get(&file->f_ref)))
 		return ERR_PTR(-EAGAIN);
file_reloaded = rcu_dereference_raw(*f); @@ -916,8 +916,8 @@ static struct file *__get_file_rcu(struct file __rcu **f) OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* - * atomic_long_inc_not_zero() above provided a full memory - * barrier when we acquired a reference. + * file_ref_get() above provided a full memory barrier when we + * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still @@ -1015,11 +1015,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * We need to confirm it by incrementing the refcount * and then check the lookup again. * - * atomic_long_inc_not_zero() gives us a full memory - * barrier. We only really need an 'acquire' one to - * protect the loads below, but we don't have that. + * file_ref_get() gives us a full memory barrier. We + * only really need an 'acquire' one to protect the + * loads below, but we don't have that. */ - if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + if (unlikely(!file_ref_get(&file->f_ref))) continue; /* diff --git a/fs/file_table.c b/fs/file_table.c index 4b23eb7b79dd..db4fde6fe620 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -169,16 +169,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); - f->f_flags = flags; - f->f_mode = OPEN_FMODE(flags); - /* f->f_version: 0 */ + memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->f_ra, 0, sizeof(f->f_ra)); + + f->f_flags = flags; + f->f_mode = OPEN_FMODE(flags); + + f->f_op = NULL; + f->f_mapping = NULL; + f->private_data = NULL; + f->f_inode = NULL; + f->f_owner = NULL; +#ifdef CONFIG_EPOLL + f->f_ep = NULL; +#endif + + f->f_iocb_flags = 0; + f->f_pos = 0; + f->f_wb_err = 0; + f->f_sb_err = 0; /* * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. 
 */
-	atomic_long_set(&f->f_count, 1);
+	file_ref_init(&f->f_ref, 1);
 	return 0;
 }
 
@@ -210,7 +226,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
 		goto over;
 	}
 
-	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
 	if (unlikely(!f))
 		return ERR_PTR(-ENOMEM);
 
@@ -244,7 +260,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
 	struct file *f;
 	int error;
 
-	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
 	if (unlikely(!f))
 		return ERR_PTR(-ENOMEM);
 
@@ -271,7 +287,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
 	struct backing_file *ff;
 	int error;
 
-	ff = kmem_cache_zalloc(bfilp_cachep, GFP_KERNEL);
+	ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
 	if (unlikely(!ff))
 		return ERR_PTR(-ENOMEM);
 
@@ -483,7 +499,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
 void fput(struct file *file)
 {
-	if (atomic_long_dec_and_test(&file->f_count)) {
+	if (file_ref_put(&file->f_ref)) {
 		struct task_struct *task = current;
 
 		if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -516,7 +532,7 @@ void fput(struct file *file)
  */
 void __fput_sync(struct file *file)
 {
-	if (atomic_long_dec_and_test(&file->f_count))
+	if (file_ref_put(&file->f_ref))
 		__fput(file);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e3c603d01337..c13f648a1c13 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -1005,7 +1006,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
 
 /**
  * struct file - Represents a file
- * @f_count: reference count
+ * @f_ref: reference count
  * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
  * @f_mode: FMODE_* flags often used in hotpaths
  * @f_op: file operations
@@ -1030,7 +1031,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
  * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
  */
 struct file {
-	atomic_long_t			f_count;
+	file_ref_t			f_ref;
 	spinlock_t			f_lock;
 	fmode_t				f_mode;
 	const struct file_operations	*f_op;
@@ -1078,15 +1079,14 @@ struct file_handle {
 
 static inline struct file *get_file(struct file *f)
 {
-	long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
-	WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
+	file_ref_inc(&f->f_ref);
 	return f;
 }
 
 struct file *get_file_rcu(struct file __rcu **f);
 struct file *get_file_active(struct file **f);
 
-#define file_count(x)	atomic_long_read(&(x)->f_count)
+#define file_count(f)	file_ref_read(&(f)->f_ref)
 
 #define MAX_NON_LFS	((1UL<<31) - 1)

From aab154a442f9ba2a08fc130dbc8d178a33e10345 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Mon, 21 Oct 2024 13:54:46 +0200
Subject: [PATCH 15/15] selftests: add file SLAB_TYPESAFE_BY_RCU recycling stressor

Add a simple file stressor that lives directly in-tree. This will create
a bunch of processes that each open 500 file descriptors and then use
close_range() to close them all.

Concurrently, other processes read /proc/<pid>/fd/, which roughly does:

    f = fget_task_next(p, &fd);
    if (!f)
            break;
    data.mode = f->f_mode;
    fput(f);

This means that it will try to get a reference to a file in another
task's file descriptor table. Under heavy file load it is increasingly
likely that the other task will manage to close @file and @file will be
recycled due to SLAB_TYPESAFE_BY_RCU concurrently.
This will trigger various warnings in the file reference counting code. Link: https://lore.kernel.org/r/20241021-vergab-streuen-924df15dceb9@brauner Signed-off-by: Christian Brauner --- .../testing/selftests/filesystems/.gitignore | 1 + tools/testing/selftests/filesystems/Makefile | 2 +- .../selftests/filesystems/file_stressor.c | 194 ++++++++++++++++++ 3 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/filesystems/file_stressor.c diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore index f0c0ff20d6cf..828b66a10c63 100644 --- a/tools/testing/selftests/filesystems/.gitignore +++ b/tools/testing/selftests/filesystems/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only dnotify_test devpts_pts +file_stressor diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile index c647fd6a0446..66305fc34c60 100644 --- a/tools/testing/selftests/filesystems/Makefile +++ b/tools/testing/selftests/filesystems/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += $(KHDR_INCLUDES) -TEST_GEN_PROGS := devpts_pts +TEST_GEN_PROGS := devpts_pts file_stressor TEST_GEN_PROGS_EXTENDED := dnotify_test include ../lib.mk diff --git a/tools/testing/selftests/filesystems/file_stressor.c b/tools/testing/selftests/filesystems/file_stressor.c new file mode 100644 index 000000000000..1136f93a9977 --- /dev/null +++ b/tools/testing/selftests/filesystems/file_stressor.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#include +#include +#include + +static inline int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, + const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} + +static inline int sys_fsmount(int fd, unsigned int flags, + unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + +#ifndef MOVE_MOUNT_F_EMPTY_PATH +#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ +#endif + +static inline int sys_move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, + to_pathname, flags); +} + +FIXTURE(file_stressor) { + int fd_tmpfs; + int nr_procs; + int max_fds; + pid_t *pids_openers; + pid_t *pids_getdents; + int *fd_proc_pid; +}; + +FIXTURE_SETUP(file_stressor) +{ + int fd_context; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + ASSERT_EQ(mkdir("/slab_typesafe_by_rcu", 0755), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + self->fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(self->fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(sys_move_mount(self->fd_tmpfs, "", -EBADF, "/slab_typesafe_by_rcu", MOVE_MOUNT_F_EMPTY_PATH), 0); + + self->nr_procs = sysconf(_SC_NPROCESSORS_ONLN); + self->pids_openers = malloc(sizeof(pid_t) * self->nr_procs); + ASSERT_NE(self->pids_openers, NULL); + self->pids_getdents = malloc(sizeof(pid_t) * self->nr_procs); + 
ASSERT_NE(self->pids_getdents, NULL);
+	self->fd_proc_pid = malloc(sizeof(int) * self->nr_procs);
+	ASSERT_NE(self->fd_proc_pid, NULL);
+	self->max_fds = 500;
+}
+
+FIXTURE_TEARDOWN(file_stressor)
+{
+	for (int i = 0; i < self->nr_procs; i++) {
+		int wstatus;
+		pid_t pid;
+
+		pid = waitpid(self->pids_openers[i], &wstatus, 0);
+		ASSERT_EQ(pid, self->pids_openers[i]);
+		ASSERT_TRUE(!WIFEXITED(wstatus) || !WIFSIGNALED(wstatus));
+
+		pid = waitpid(self->pids_getdents[i], &wstatus, 0);
+		ASSERT_EQ(pid, self->pids_getdents[i]);
+		ASSERT_TRUE(!WIFEXITED(wstatus) || !WIFSIGNALED(wstatus));
+	}
+	free(self->pids_openers);
+	free(self->pids_getdents);
+	ASSERT_EQ(close(self->fd_tmpfs), 0);
+
+	umount2("/slab_typesafe_by_rcu", 0);
+	ASSERT_EQ(rmdir("/slab_typesafe_by_rcu"), 0);
+}
+
+TEST_F_TIMEOUT(file_stressor, slab_typesafe_by_rcu, 900 * 2)
+{
+	for (int i = 0; i < self->nr_procs; i++) {
+		pid_t pid_self;
+
+		self->pids_openers[i] = fork();
+		ASSERT_GE(self->pids_openers[i], 0);
+
+		if (self->pids_openers[i] != 0)
+			continue;
+
+		self->pids_openers[i] = getpid();
+		for (;;) {
+			for (int i = 0; i < self->max_fds; i++) {
+				char path[PATH_MAX];
+				int fd;
+
+				sprintf(path, "/slab_typesafe_by_rcu/file-%d-%d", self->pids_openers[i], i);
+				fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0644);
+				if (fd < 0)
+					continue;
+			}
+
+			close_range(3, ~0U, 0);
+		}
+
+		exit(0);
+	}
+
+	for (int i = 0; i < self->nr_procs; i++) {
+		char path[PATH_MAX];
+
+		sprintf(path, "/proc/%d/fd/", self->pids_openers[i]);
+		self->fd_proc_pid[i] = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(self->fd_proc_pid[i], 0);
+	}
+
+	for (int i = 0; i < self->nr_procs; i++) {
+		self->pids_getdents[i] = fork();
+		ASSERT_GE(self->pids_getdents[i], 0);
+
+		if (self->pids_getdents[i] != 0)
+			continue;
+
+		self->pids_getdents[i] = getpid();
+		for (;;) {
+			char ents[1024];
+			ssize_t nr_read;
+
+			/*
+			 * Concurrently read /proc/<pid>/fd/ which roughly does:
+			 *
+			 * f = fget_task_next(p, &fd);
+			 * if (!f)
+			 *	break;
+			 * data.mode = f->f_mode;
+			 * fput(f);
+			 *
+			 * This means that it'll try to get a reference to a
+			 * file in another task's file descriptor table.
+			 *
+			 * Under heavy file load it is increasingly likely that
+			 * the other task will manage to close @file and @file
+			 * is being recycled due to SLAB_TYPESAFE_BY_RCU
+			 * concurrently. This will trigger various warnings in
+			 * the file reference counting code.
+			 */
+			do {
+				nr_read = syscall(SYS_getdents64, self->fd_proc_pid[i], ents, sizeof(ents));
+			} while (nr_read >= 0);
+
+			lseek(self->fd_proc_pid[i], 0, SEEK_SET);
+		}
+
+		exit(0);
+	}
+
+	ASSERT_EQ(clock_nanosleep(CLOCK_MONOTONIC, 0, &(struct timespec){ .tv_sec = 900 /* 15 min */ }, NULL), 0);
+
+	for (int i = 0; i < self->nr_procs; i++) {
+		kill(self->pids_openers[i], SIGKILL);
+		kill(self->pids_getdents[i], SIGKILL);
+	}
+}
+
+TEST_HARNESS_MAIN
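
For ad-hoc experiments outside the kselftest harness, the reader half of the
stressor can be approximated with a tiny standalone program (not part of the
patch; the file name and structure below are illustrative only) that spins on
getdents64() over /proc/<pid>/fd/ of some process that is busy opening and
closing files:

    /* fd_reader.c: illustrative only, not part of the patch. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    int main(int argc, char **argv)
    {
            char path[64], ents[1024];
            ssize_t nr;
            int fd;

            if (argc != 2) {
                    fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                    return 1;
            }
            snprintf(path, sizeof(path), "/proc/%s/fd/", argv[1]);
            fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            for (;;) {
                    /* Iterate the fd directory over and over, like the selftest's getdents loop. */
                    do {
                            nr = syscall(SYS_getdents64, fd, ents, sizeof(ents));
                    } while (nr > 0);
                    lseek(fd, 0, SEEK_SET);
            }
    }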