vfs-6.13.file

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZzcW4gAKCRCRxhvAZXjc
 okF+AP9xTMb2SlnRPBOBd9yFcmVXmQi86TSCUPAEVb+wIldGYwD/RIOdvXYJlp9v
 RgJkU1DC3ddkXtONNDY6gFaP+siIWA0=
 =gMc7
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.13.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs file updates from Christian Brauner:
 "This contains the changes for files for this cycle:

   - Introduce a new reference counting mechanism for files.

     As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop
     it has O(N^2) behaviour under contention with N concurrent
     operations and it is in a hot path in __fget_files_rcu().

     The rcuref infrastructure remedies this problem by using an
     unconditional increment, relying on safe- and dead zones to make
     this work and requiring rcu protection for the data structure in
     question. This not only scales better, it also introduces overflow
     protection.

     However, in contrast to generic rcuref, files require a memory
     barrier and thus cannot rely on *_relaxed() atomic operations. They
     also need to be built on atomic_long_t, as having massive numbers
     of references isn't unheard of, even if it is just an attack.

     This adds a file specific variant instead of making this a generic
     library.

     This has been tested by various people and it gives consistent
     improvement up to 3-5% on workloads with loads of threads.

   - Add a fastpath for find_next_zero_bit(). Skip 2-levels searching
     via find_next_zero_bit() when there is a free slot in the word that
     contains the next fd. This improves pts/blogbench-1.1.0 read by 8%
     and write by 4% on Intel ICX 160.

   - Conditionally clear full_fds_bits since it's very likely that a bit
     in full_fds_bits has been cleared during __clear_open_fds(). This
     improves pts/blogbench-1.1.0 read up to 13%, and write up to 5% on
     Intel ICX 160.

   - Get rid of all lookup_*_fdget_rcu() variants. They were used to
     lookup files without taking a reference count. That became invalid
     once files were switched to SLAB_TYPESAFE_BY_RCU and now we're
     always taking a reference count. Switch to an already existing
     helper and remove the legacy variants.

   - Remove pointless includes of <linux/fdtable.h>.

   - Avoid cmpxchg() in close_files() as nobody else has a reference to
     the files_struct at that point.

   - Move close_range() into fs/file.c and fold __close_range() into it.

   - Cleanup calling conventions of alloc_fdtable() and expand_files().

   - Merge __{set,clear}_close_on_exec() into one.

   - Make __set_open_fd() set cloexec as well instead of doing it in two
     separate steps"

* tag 'vfs-6.13.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  selftests: add file SLAB_TYPESAFE_BY_RCU recycling stressor
  fs: port files to file_ref
  fs: add file_ref
  expand_files(): simplify calling conventions
  make __set_open_fd() set cloexec state as well
  fs: protect backing files with rcu
  file.c: merge __{set,clear}_close_on_exec()
  alloc_fdtable(): change calling conventions.
  fs/file.c: add fast path in find_next_fd()
  fs/file.c: conditionally clear full_fds
  fs/file.c: remove sanity_check and add likely/unlikely in alloc_fd()
  move close_range(2) into fs/file.c, fold __close_range() into it
  close_files(): don't bother with xchg()
  remove pointless includes of <linux/fdtable.h>
  get rid of ...lookup...fdget_rcu() family
This commit is contained in:
Linus Torvalds 2024-11-18 10:30:29 -08:00
commit 4c797b11a8
34 changed files with 576 additions and 226 deletions

View File

@ -73,9 +73,7 @@ static struct spu_context *coredump_next_context(int *fd)
return NULL; return NULL;
*fd = n - 1; *fd = n - 1;
rcu_read_lock(); file = fget_raw(*fd);
file = lookup_fdget_rcu(*fd);
rcu_read_unlock();
if (file) { if (file) {
ctx = SPUFS_I(file_inode(file))->i_ctx; ctx = SPUFS_I(file_inode(file))->i_ctx;
get_spu_context(ctx); get_spu_context(ctx);

View File

@ -40,7 +40,7 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
if (i915_gem_object_is_shmem(obj)) { if (i915_gem_object_is_shmem(obj)) {
file = obj->base.filp; file = obj->base.filp;
atomic_long_inc(&file->f_count); get_file(file);
return file; return file;
} }

View File

@ -471,7 +471,7 @@ void ttm_object_device_release(struct ttm_object_device **p_tdev)
*/ */
static bool __must_check get_dma_buf_unless_doomed(struct dma_buf *dmabuf) static bool __must_check get_dma_buf_unless_doomed(struct dma_buf *dmabuf)
{ {
return atomic_long_inc_not_zero(&dmabuf->file->f_count) != 0L; return file_ref_get(&dmabuf->file->f_ref);
} }
/** /**

View File

@ -1003,7 +1003,7 @@ static struct file *epi_fget(const struct epitem *epi)
struct file *file; struct file *file;
file = epi->ffd.file; file = epi->ffd.file;
if (!atomic_long_inc_not_zero(&file->f_count)) if (!file_ref_get(&file->f_ref))
file = NULL; file = NULL;
return file; return file;
} }

View File

@ -12,7 +12,6 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/filelock.h> #include <linux/filelock.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/capability.h> #include <linux/capability.h>
#include <linux/dnotify.h> #include <linux/dnotify.h>
#include <linux/slab.h> #include <linux/slab.h>

269
fs/file.c
View File

@ -20,10 +20,73 @@
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/rcupdate.h> #include <linux/rcupdate.h>
#include <linux/close_range.h> #include <linux/close_range.h>
#include <linux/file_ref.h>
#include <net/sock.h> #include <net/sock.h>
#include "internal.h" #include "internal.h"
/**
* __file_ref_put - Slowpath of file_ref_put()
* @ref: Pointer to the reference count
* @cnt: Current reference count
*
* Invoked when the reference count is outside of the valid zone.
*
* Three cases are distinguished, checked in this order:
* 1. @cnt == FILE_REF_NOREF: the last reference was dropped; try to mark
*    the count FILE_REF_DEAD and report success only if that succeeds.
* 2. @cnt >= FILE_REF_RELEASED: imbalanced put(); warn once, reset the
*    count to FILE_REF_DEAD and report failure.
* 3. @cnt > FILE_REF_MAXREF: put() on a saturated count; reset the count
*    to FILE_REF_SATURATED and report failure.
* Any other value of @cnt also reports failure without touching the count.
*
* Return:
* True if this was the last reference with no future references
* possible. This signals the caller that it can safely schedule the
* object, which is protected by the reference counter, for
* deconstruction.
*
* False if there are still active references or the put() raced
* with a concurrent get()/put() pair. Caller is not allowed to
* deconstruct the protected object.
*/
bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
{
/* Did this drop the last reference? */
if (likely(cnt == FILE_REF_NOREF)) {
/*
* Carefully try to set the reference count to FILE_REF_DEAD.
*
* This can fail if a concurrent get() operation has
* elevated it again or the corresponding put() even marked
* it dead already. Both are valid situations and do not
* require a retry. If this fails the caller is not
* allowed to deconstruct the object.
*/
if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
return false;
/*
* The caller can safely schedule the object for
* deconstruction. Provide acquire ordering.
*/
smp_acquire__after_ctrl_dep();
return true;
}
/*
* If the reference count was already in the dead zone, then this
* put() operation is imbalanced. Warn, put the reference count back to
* DEAD and tell the caller to not deconstruct the object.
*/
if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
return false;
}
/*
* This is a put() operation on a saturated refcount. Restore the
* mean saturation value and tell the caller to not deconstruct the
* object.
*/
if (cnt > FILE_REF_MAXREF)
atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
/* Reached for the saturated case and for any remaining value of @cnt. */
return false;
}
EXPORT_SYMBOL_GPL(__file_ref_put);
unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG; unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */ /* our min() is unusable in constant expressions ;-/ */
@ -89,18 +152,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
* 'unsigned long' in some places, but simply because that is how the Linux * 'unsigned long' in some places, but simply because that is how the Linux
* kernel bitmaps are defined to work: they are not "bits in an array of bytes", * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
* they are very much "bits in an array of unsigned long". * they are very much "bits in an array of unsigned long".
*
* The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
* by that "1024/sizeof(ptr)" before, we already know there are sufficient
* clear low bits. Clang seems to realize that, gcc ends up being confused.
*
* On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
* let's consider it documentation (and maybe a test-case for gcc to improve
* its code generation ;)
*/ */
static struct fdtable * alloc_fdtable(unsigned int nr) static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{ {
struct fdtable *fdt; struct fdtable *fdt;
unsigned int nr;
void *data; void *data;
/* /*
@ -108,22 +164,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
* Allocation steps are keyed to the size of the fdarray, since it * Allocation steps are keyed to the size of the fdarray, since it
* grows far faster than any of the other dynamic data. We try to fit * grows far faster than any of the other dynamic data. We try to fit
* the fdarray into comfortable page-tuned chunks: starting at 1024B * the fdarray into comfortable page-tuned chunks: starting at 1024B
* and growing in powers of two from there on. * and growing in powers of two from there on. Since we called only
* with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab
* already gives BITS_PER_LONG slots), the above boils down to
* 1. use the smallest power of two large enough to give us that many
* slots.
* 2. on 32bit skip 64 and 128 - the minimal capacity we want there is
* 256 slots (i.e. 1Kb fd array).
* 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there
* and we are never going to be asked for 64 or less.
*/ */
nr /= (1024 / sizeof(struct file *)); if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
nr = roundup_pow_of_two(nr + 1); nr = 256;
nr *= (1024 / sizeof(struct file *)); else
nr = ALIGN(nr, BITS_PER_LONG); nr = roundup_pow_of_two(slots_wanted);
/* /*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open * Note that this can drive nr *below* what we had passed if sysctl_nr_open
* had been set lower between the check in expand_files() and here. Deal * had been set lower between the check in expand_files() and here.
* with that in caller, it's cheaper that way.
* *
* We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
* bitmaps handling below becomes unpleasant, to put it mildly... * bitmaps handling below becomes unpleasant, to put it mildly...
*/ */
if (unlikely(nr > sysctl_nr_open)) if (unlikely(nr > sysctl_nr_open)) {
nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; nr = round_down(sysctl_nr_open, BITS_PER_LONG);
if (nr < slots_wanted)
return ERR_PTR(-EMFILE);
}
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt) if (!fdt)
@ -152,14 +218,14 @@ out_arr:
out_fdt: out_fdt:
kfree(fdt); kfree(fdt);
out: out:
return NULL; return ERR_PTR(-ENOMEM);
} }
/* /*
* Expand the file descriptor table. * Expand the file descriptor table.
* This function will allocate a new fdtable and both fd array and fdset, of * This function will allocate a new fdtable and both fd array and fdset, of
* the given size. * the given size.
* Return <0 error code on error; 1 on successful completion. * Return <0 error code on error; 0 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit. * The files->file_lock should be held on entry, and will be held on exit.
*/ */
static int expand_fdtable(struct files_struct *files, unsigned int nr) static int expand_fdtable(struct files_struct *files, unsigned int nr)
@ -169,7 +235,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
struct fdtable *new_fdt, *cur_fdt; struct fdtable *new_fdt, *cur_fdt;
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
new_fdt = alloc_fdtable(nr); new_fdt = alloc_fdtable(nr + 1);
/* make sure all fd_install() have seen resize_in_progress /* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section. * or have finished their rcu_read_lock_sched() section.
@ -178,16 +244,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
synchronize_rcu(); synchronize_rcu();
spin_lock(&files->file_lock); spin_lock(&files->file_lock);
if (!new_fdt) if (IS_ERR(new_fdt))
return -ENOMEM; return PTR_ERR(new_fdt);
/*
* extremely unlikely race - sysctl_nr_open decreased between the check in
* caller and alloc_fdtable(). Cheaper to catch it here...
*/
if (unlikely(new_fdt->max_fds <= nr)) {
__free_fdtable(new_fdt);
return -EMFILE;
}
cur_fdt = files_fdtable(files); cur_fdt = files_fdtable(files);
BUG_ON(nr < cur_fdt->max_fds); BUG_ON(nr < cur_fdt->max_fds);
copy_fdtable(new_fdt, cur_fdt); copy_fdtable(new_fdt, cur_fdt);
@ -196,15 +254,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu); call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
/* coupled with smp_rmb() in fd_install() */ /* coupled with smp_rmb() in fd_install() */
smp_wmb(); smp_wmb();
return 1; return 0;
} }
/* /*
* Expand files. * Expand files.
* This function will expand the file structures, if the requested size exceeds * This function will expand the file structures, if the requested size exceeds
* the current capacity and there is room for expansion. * the current capacity and there is room for expansion.
* Return <0 error code on error; 0 when nothing done; 1 when files were * Return <0 error code on error; 0 on success.
* expanded and execution may have blocked.
* The files->file_lock should be held on entry, and will be held on exit. * The files->file_lock should be held on entry, and will be held on exit.
*/ */
static int expand_files(struct files_struct *files, unsigned int nr) static int expand_files(struct files_struct *files, unsigned int nr)
@ -212,14 +269,14 @@ static int expand_files(struct files_struct *files, unsigned int nr)
__acquires(files->file_lock) __acquires(files->file_lock)
{ {
struct fdtable *fdt; struct fdtable *fdt;
int expanded = 0; int error;
repeat: repeat:
fdt = files_fdtable(files); fdt = files_fdtable(files);
/* Do we need to expand? */ /* Do we need to expand? */
if (nr < fdt->max_fds) if (nr < fdt->max_fds)
return expanded; return 0;
/* Can we expand? */ /* Can we expand? */
if (nr >= sysctl_nr_open) if (nr >= sysctl_nr_open)
@ -227,7 +284,6 @@ repeat:
if (unlikely(files->resize_in_progress)) { if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
expanded = 1;
wait_event(files->resize_wait, !files->resize_in_progress); wait_event(files->resize_wait, !files->resize_in_progress);
spin_lock(&files->file_lock); spin_lock(&files->file_lock);
goto repeat; goto repeat;
@ -235,27 +291,28 @@ repeat:
/* All good, so we try */ /* All good, so we try */
files->resize_in_progress = true; files->resize_in_progress = true;
expanded = expand_fdtable(files, nr); error = expand_fdtable(files, nr);
files->resize_in_progress = false; files->resize_in_progress = false;
wake_up_all(&files->resize_wait); wake_up_all(&files->resize_wait);
return expanded; return error;
} }
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
bool set)
{ {
if (set) {
__set_bit(fd, fdt->close_on_exec); __set_bit(fd, fdt->close_on_exec);
} } else {
static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
if (test_bit(fd, fdt->close_on_exec)) if (test_bit(fd, fdt->close_on_exec))
__clear_bit(fd, fdt->close_on_exec); __clear_bit(fd, fdt->close_on_exec);
}
} }
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
{ {
__set_bit(fd, fdt->open_fds); __set_bit(fd, fdt->open_fds);
__set_close_on_exec(fd, fdt, set);
fd /= BITS_PER_LONG; fd /= BITS_PER_LONG;
if (!~fdt->open_fds[fd]) if (!~fdt->open_fds[fd])
__set_bit(fd, fdt->full_fds_bits); __set_bit(fd, fdt->full_fds_bits);
@ -264,7 +321,9 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{ {
__clear_bit(fd, fdt->open_fds); __clear_bit(fd, fdt->open_fds);
__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); fd /= BITS_PER_LONG;
if (test_bit(fd, fdt->full_fds_bits))
__clear_bit(fd, fdt->full_fds_bits);
} }
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
@ -306,7 +365,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
struct file **old_fds, **new_fds; struct file **old_fds, **new_fds;
unsigned int open_files, i; unsigned int open_files, i;
struct fdtable *old_fdt, *new_fdt; struct fdtable *old_fdt, *new_fdt;
int error;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
if (!newf) if (!newf)
@ -338,17 +396,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
if (new_fdt != &newf->fdtab) if (new_fdt != &newf->fdtab)
__free_fdtable(new_fdt); __free_fdtable(new_fdt);
new_fdt = alloc_fdtable(open_files - 1); new_fdt = alloc_fdtable(open_files);
if (!new_fdt) { if (IS_ERR(new_fdt)) {
error = -ENOMEM; kmem_cache_free(files_cachep, newf);
goto out_release; return ERR_CAST(new_fdt);
}
/* beyond sysctl_nr_open; nothing to do */
if (unlikely(new_fdt->max_fds < open_files)) {
__free_fdtable(new_fdt);
error = -EMFILE;
goto out_release;
} }
/* /*
@ -389,10 +440,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
rcu_assign_pointer(newf->fdt, new_fdt); rcu_assign_pointer(newf->fdt, new_fdt);
return newf; return newf;
out_release:
kmem_cache_free(files_cachep, newf);
return ERR_PTR(error);
} }
static struct fdtable *close_files(struct files_struct * files) static struct fdtable *close_files(struct files_struct * files)
@ -413,7 +460,7 @@ static struct fdtable *close_files(struct files_struct * files)
set = fdt->open_fds[j++]; set = fdt->open_fds[j++];
while (set) { while (set) {
if (set & 1) { if (set & 1) {
struct file * file = xchg(&fdt->fd[i], NULL); struct file *file = fdt->fd[i];
if (file) { if (file) {
filp_close(file, files); filp_close(file, files);
cond_resched(); cond_resched();
@ -470,6 +517,15 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG;
unsigned int bit;
/*
* Try to avoid looking at the second level bitmap
*/
bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
start & (BITS_PER_LONG - 1));
if (bit < BITS_PER_LONG)
return bit + bitbit * BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit >= maxfd) if (bitbit >= maxfd)
@ -496,7 +552,7 @@ repeat:
if (fd < files->next_fd) if (fd < files->next_fd)
fd = files->next_fd; fd = files->next_fd;
if (fd < fdt->max_fds) if (likely(fd < fdt->max_fds))
fd = find_next_fd(fdt, fd); fd = find_next_fd(fdt, fd);
/* /*
@ -504,36 +560,22 @@ repeat:
* will limit the total number of files that can be opened. * will limit the total number of files that can be opened.
*/ */
error = -EMFILE; error = -EMFILE;
if (fd >= end) if (unlikely(fd >= end))
goto out; goto out;
if (unlikely(fd >= fdt->max_fds)) {
error = expand_files(files, fd); error = expand_files(files, fd);
if (error < 0) if (error < 0)
goto out; goto out;
/*
* If we needed to expand the fs array we
* might have blocked - try again.
*/
if (error)
goto repeat; goto repeat;
}
if (start <= files->next_fd) if (start <= files->next_fd)
files->next_fd = fd + 1; files->next_fd = fd + 1;
__set_open_fd(fd, fdt); __set_open_fd(fd, fdt, flags & O_CLOEXEC);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
error = fd; error = fd;
#if 1
/* Sanity check */
if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt->fd[fd], NULL);
}
#endif
out: out:
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
@ -599,7 +641,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched(); rcu_read_unlock_sched();
spin_lock(&files->file_lock); spin_lock(&files->file_lock);
fdt = files_fdtable(files); fdt = files_fdtable(files);
BUG_ON(fdt->fd[fd] != NULL); WARN_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file); rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
return; return;
@ -713,7 +755,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
} }
/** /**
* __close_range() - Close all file descriptors in a given range. * sys_close_range() - Close all file descriptors in a given range.
* *
* @fd: starting file descriptor to close * @fd: starting file descriptor to close
* @max_fd: last file descriptor to close * @max_fd: last file descriptor to close
@ -721,8 +763,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
* *
* This closes a range of file descriptors. All file descriptors * This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed. * from @fd up to and including @max_fd are closed.
* Currently, errors to close a given file descriptor are ignored.
*/ */
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
unsigned int, flags)
{ {
struct task_struct *me = current; struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL; struct files_struct *cur_fds = me->files, *fds = NULL;
@ -839,7 +883,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
if (!file) if (!file)
return NULL; return NULL;
if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) if (unlikely(!file_ref_get(&file->f_ref)))
return ERR_PTR(-EAGAIN); return ERR_PTR(-EAGAIN);
file_reloaded = rcu_dereference_raw(*f); file_reloaded = rcu_dereference_raw(*f);
@ -853,8 +897,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
OPTIMIZER_HIDE_VAR(file_reloaded_cmp); OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
/* /*
* atomic_long_inc_not_zero() above provided a full memory * file_ref_get() above provided a full memory barrier when we
* barrier when we acquired a reference. * acquired a reference.
* *
* This is paired with the write barrier from assigning to the * This is paired with the write barrier from assigning to the
* __rcu protected file pointer so that if that pointer still * __rcu protected file pointer so that if that pointer still
@ -952,11 +996,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* We need to confirm it by incrementing the refcount * We need to confirm it by incrementing the refcount
* and then check the lookup again. * and then check the lookup again.
* *
* atomic_long_inc_not_zero() gives us a full memory * file_ref_get() gives us a full memory barrier. We
* barrier. We only really need an 'acquire' one to * only really need an 'acquire' one to protect the
* protect the loads below, but we don't have that. * loads below, but we don't have that.
*/ */
if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) if (unlikely(!file_ref_get(&file->f_ref)))
continue; continue;
/* /*
@ -1037,29 +1081,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file; return file;
} }
struct file *lookup_fdget_rcu(unsigned int fd) struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
{
return __fget_files_rcu(current->files, fd, 0);
}
EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
struct file *file = NULL;
task_lock(task);
files = task->files;
if (files)
file = __fget_files_rcu(files, fd, 0);
task_unlock(task);
return file;
}
struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
{ {
/* Must be called with rcu_read_lock held */ /* Must be called with rcu_read_lock held */
struct files_struct *files; struct files_struct *files;
@ -1069,17 +1091,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *
task_lock(task); task_lock(task);
files = task->files; files = task->files;
if (files) { if (files) {
rcu_read_lock();
for (; fd < files_fdtable(files)->max_fds; fd++) { for (; fd < files_fdtable(files)->max_fds; fd++) {
file = __fget_files_rcu(files, fd, 0); file = __fget_files_rcu(files, fd, 0);
if (file) if (file)
break; break;
} }
rcu_read_unlock();
} }
task_unlock(task); task_unlock(task);
*ret_fd = fd; *ret_fd = fd;
return file; return file;
} }
EXPORT_SYMBOL(task_lookup_next_fdget_rcu); EXPORT_SYMBOL(fget_task_next);
/* /*
* Lightweight file lookup - no refcnt increment if fd table isn't shared. * Lightweight file lookup - no refcnt increment if fd table isn't shared.
@ -1183,13 +1207,8 @@ void __f_unlock_pos(struct file *f)
void set_close_on_exec(unsigned int fd, int flag) void set_close_on_exec(unsigned int fd, int flag)
{ {
struct files_struct *files = current->files; struct files_struct *files = current->files;
struct fdtable *fdt;
spin_lock(&files->file_lock); spin_lock(&files->file_lock);
fdt = files_fdtable(files); __set_close_on_exec(fd, files_fdtable(files), flag);
if (flag)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
} }
@ -1230,11 +1249,7 @@ __releases(&files->file_lock)
goto Ebusy; goto Ebusy;
get_file(file); get_file(file);
rcu_assign_pointer(fdt->fd[fd], file); rcu_assign_pointer(fdt->fd[fd], file);
__set_open_fd(fd, fdt); __set_open_fd(fd, fdt, flags & O_CLOEXEC);
if (flags & O_CLOEXEC)
__set_close_on_exec(fd, fdt);
else
__clear_close_on_exec(fd, fdt);
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
if (tofree) if (tofree)

View File

@ -9,7 +9,6 @@
#include <linux/string.h> #include <linux/string.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/fs.h> #include <linux/fs.h>
@ -40,13 +39,17 @@ static struct files_stat_struct files_stat = {
/* SLAB cache for file structures */ /* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init; static struct kmem_cache *filp_cachep __ro_after_init;
static struct kmem_cache *bfilp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp; static struct percpu_counter nr_files __cacheline_aligned_in_smp;
/* Container for backing file with optional user path */ /* Container for backing file with optional user path */
struct backing_file { struct backing_file {
struct file file; struct file file;
union {
struct path user_path; struct path user_path;
freeptr_t bf_freeptr;
};
}; };
static inline struct backing_file *backing_file(struct file *f) static inline struct backing_file *backing_file(struct file *f)
@ -68,7 +71,7 @@ static inline void file_free(struct file *f)
put_cred(f->f_cred); put_cred(f->f_cred);
if (unlikely(f->f_mode & FMODE_BACKING)) { if (unlikely(f->f_mode & FMODE_BACKING)) {
path_put(backing_file_user_path(f)); path_put(backing_file_user_path(f));
kfree(backing_file(f)); kmem_cache_free(bfilp_cachep, backing_file(f));
} else { } else {
kmem_cache_free(filp_cachep, f); kmem_cache_free(filp_cachep, f);
} }
@ -165,16 +168,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* the respective member when opening the file. * the respective member when opening the file.
*/ */
mutex_init(&f->f_pos_lock); mutex_init(&f->f_pos_lock);
memset(&f->f_path, 0, sizeof(f->f_path));
memset(&f->f_ra, 0, sizeof(f->f_ra));
f->f_flags = flags; f->f_flags = flags;
f->f_mode = OPEN_FMODE(flags); f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
f->f_op = NULL;
f->f_mapping = NULL;
f->private_data = NULL;
f->f_inode = NULL;
f->f_owner = NULL;
#ifdef CONFIG_EPOLL
f->f_ep = NULL;
#endif
f->f_iocb_flags = 0;
f->f_pos = 0;
f->f_wb_err = 0;
f->f_sb_err = 0;
/* /*
* We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
* fget-rcu pattern users need to be able to handle spurious * fget-rcu pattern users need to be able to handle spurious
* refcount bumps we should reinitialize the reused file first. * refcount bumps we should reinitialize the reused file first.
*/ */
atomic_long_set(&f->f_count, 1); file_ref_init(&f->f_ref, 1);
return 0; return 0;
} }
@ -206,7 +225,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over; goto over;
} }
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f)) if (unlikely(!f))
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
@ -240,7 +259,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
struct file *f; struct file *f;
int error; int error;
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f)) if (unlikely(!f))
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
@ -267,13 +286,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
struct backing_file *ff; struct backing_file *ff;
int error; int error;
ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
if (unlikely(!ff)) if (unlikely(!ff))
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
error = init_file(&ff->file, flags, cred); error = init_file(&ff->file, flags, cred);
if (unlikely(error)) { if (unlikely(error)) {
kfree(ff); kmem_cache_free(bfilp_cachep, ff);
return ERR_PTR(error); return ERR_PTR(error);
} }
@ -479,7 +498,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file) void fput(struct file *file)
{ {
if (atomic_long_dec_and_test(&file->f_count)) { if (file_ref_put(&file->f_ref)) {
struct task_struct *task = current; struct task_struct *task = current;
if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@ -512,7 +531,7 @@ void fput(struct file *file)
*/ */
void __fput_sync(struct file *file) void __fput_sync(struct file *file)
{ {
if (atomic_long_dec_and_test(&file->f_count)) if (file_ref_put(&file->f_ref))
__fput(file); __fput(file);
} }
@ -529,6 +548,11 @@ void __init files_init(void)
filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
&args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
percpu_counter_init(&nr_files, 0, GFP_KERNEL); percpu_counter_init(&nr_files, 0, GFP_KERNEL);
} }

View File

@ -34,7 +34,6 @@
#include <linux/lockref.h> #include <linux/lockref.h>
#include <linux/rhashtable.h> #include <linux/rhashtable.h>
#include <linux/pid_namespace.h> #include <linux/pid_namespace.h>
#include <linux/fdtable.h>
#include <linux/file.h> #include <linux/file.h>
#include "gfs2.h" #include "gfs2.h"
@ -2768,25 +2767,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
i->file = NULL; i->file = NULL;
} }
rcu_read_lock();
for(;; i->fd++) { for(;; i->fd++) {
struct inode *inode; i->file = fget_task_next(i->task, &i->fd);
i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
if (!i->file) { if (!i->file) {
i->fd = 0; i->fd = 0;
break; break;
} }
inode = file_inode(i->file); if (file_inode(i->file)->i_sb == i->sb)
if (inode->i_sb == i->sb)
break; break;
rcu_read_unlock();
fput(i->file); fput(i->file);
rcu_read_lock();
} }
rcu_read_unlock();
return i->file; return i->file;
} }

View File

@ -16,7 +16,6 @@
#include <linux/security.h> #include <linux/security.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h> #include <linux/fsnotify_backend.h>
static int dir_notify_enable __read_mostly = 1; static int dir_notify_enable __read_mostly = 1;
@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
new_fsn_mark = NULL; new_fsn_mark = NULL;
} }
rcu_read_lock(); f = fget_raw(fd);
f = lookup_fdget_rcu(fd);
rcu_read_unlock();
/* if (f != filp) means that we lost a race and another task/thread /* if (f != filp) means that we lost a race and another task/thread
* actually closed the fd we are still playing with before we grabbed * actually closed the fd we are still playing with before we grabbed

View File

@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h> #include <linux/fanotify.h>
#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h> #include <linux/fsnotify_backend.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/jiffies.h> #include <linux/jiffies.h>

View File

@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h> #include <linux/fanotify.h>
#include <linux/fcntl.h> #include <linux/fcntl.h>
#include <linux/fdtable.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/anon_inodes.h> #include <linux/anon_inodes.h>

View File

@ -1576,23 +1576,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
return retval; return retval;
} }
/**
* sys_close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
* @flags: reserved for future extensions
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
* Currently, errors to close a given file descriptor are ignored.
*/
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
unsigned int, flags)
{
return __close_range(fd, max_fd, flags);
}
/* /*
* This routine simulates a hangup on the tty, to arrange that users * This routine simulates a hangup on the tty, to arrange that users
* are given clean terminals at login time. * are given clean terminals at login time.

View File

@ -16,7 +16,6 @@
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <linux/cred.h> #include <linux/cred.h>
#include <linux/namei.h> #include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include <linux/exportfs.h> #include <linux/exportfs.h>
#include "overlayfs.h" #include "overlayfs.h"

View File

@ -58,7 +58,6 @@
#include <linux/init.h> #include <linux/init.h>
#include <linux/capability.h> #include <linux/capability.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h> #include <linux/generic-radix-tree.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>

View File

@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
{ {
struct file *file; struct file *file;
rcu_read_lock(); file = fget_task(task, fd);
file = task_lookup_fdget_rcu(task, fd);
rcu_read_unlock();
if (file) { if (file) {
*mode = file->f_mode; *mode = file->f_mode;
fput(file); fput(file);
@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx)) if (!dir_emit_dots(file, ctx))
goto out; goto out;
rcu_read_lock();
for (fd = ctx->pos - 2;; fd++) { for (fd = ctx->pos - 2;; fd++) {
struct file *f; struct file *f;
struct fd_data data; struct fd_data data;
char name[10 + 1]; char name[10 + 1];
unsigned int len; unsigned int len;
f = task_lookup_next_fdget_rcu(p, &fd); f = fget_task_next(p, &fd);
ctx->pos = fd + 2LL; ctx->pos = fd + 2LL;
if (!f) if (!f)
break; break;
data.mode = f->f_mode; data.mode = f->f_mode;
rcu_read_unlock();
fput(f); fput(f);
data.fd = fd; data.fd = fd;
@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!proc_fill_cache(file, ctx, if (!proc_fill_cache(file, ctx,
name, len, instantiate, p, name, len, instantiate, p,
&data)) &data))
goto out; break;
cond_resched(); cond_resched();
rcu_read_lock();
} }
rcu_read_unlock();
out: out:
put_task_struct(p); put_task_struct(p);
return 0; return 0;

View File

@ -92,10 +92,6 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
return files_lookup_fd_raw(files, fd); return files_lookup_fd_raw(files, fd);
} }
struct file *lookup_fdget_rcu(unsigned int fd);
struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
static inline bool close_on_exec(unsigned int fd, const struct files_struct *files) static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
{ {
return test_bit(fd, files_fdtable(files)->close_on_exec); return test_bit(fd, files_fdtable(files)->close_on_exec);
@ -115,7 +111,6 @@ int iterate_fd(struct files_struct *, unsigned,
const void *); const void *);
extern int close_fd(unsigned int fd); extern int close_fd(unsigned int fd);
extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern struct file *file_close_fd(unsigned int fd); extern struct file *file_close_fd(unsigned int fd);
extern struct kmem_cache *files_cachep; extern struct kmem_cache *files_cachep;

View File

@ -72,6 +72,7 @@ static inline void fdput(struct fd fd)
extern struct file *fget(unsigned int fd); extern struct file *fget(unsigned int fd);
extern struct file *fget_raw(unsigned int fd); extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd);
extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd);
extern void __f_unlock_pos(struct file *); extern void __f_unlock_pos(struct file *);
struct fd fdget(unsigned int fd); struct fd fdget(unsigned int fd);

177
include/linux/file_ref.h Normal file
View File

@ -0,0 +1,177 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_FILE_REF_H
#define _LINUX_FILE_REF_H
#include <linux/atomic.h>
#include <linux/preempt.h>
#include <linux/types.h>
/*
* file_ref is a reference count implementation specifically for use by
* files. It takes inspiration from rcuref but differs in key aspects
* such as support for SLAB_TYPESAFE_BY_RCU type caches.
*
* FILE_REF_ONEREF FILE_REF_MAXREF
* 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL
* <-------------------valid ------------------->
*
* FILE_REF_SATURATED
* 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL
* <-----------------------saturation zone---------------------->
*
* FILE_REF_RELEASED FILE_REF_DEAD
* 0xC000000000000000UL 0xE000000000000000UL
* <-------------------dead zone------------------->
*
* FILE_REF_NOREF
* 0xFFFFFFFFFFFFFFFFUL
*/
/*
 * Zone boundaries of the file reference counter. The stored value is the
 * number of held references minus one, so FILE_REF_ONEREF (zero) means that
 * exactly one reference is held and FILE_REF_NOREF (all bits set) means that
 * none is. The 32-bit constants mirror the 64-bit layout.
 */
#ifdef CONFIG_64BIT
#define FILE_REF_ONEREF 0x0000000000000000UL
#define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL
#define FILE_REF_SATURATED 0xA000000000000000UL
#define FILE_REF_RELEASED 0xC000000000000000UL
#define FILE_REF_DEAD 0xE000000000000000UL
#define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL
#else
#define FILE_REF_ONEREF 0x00000000U
#define FILE_REF_MAXREF 0x7FFFFFFFU
#define FILE_REF_SATURATED 0xA0000000U
#define FILE_REF_RELEASED 0xC0000000U
#define FILE_REF_DEAD 0xE0000000U
#define FILE_REF_NOREF 0xFFFFFFFFU
#endif
/*
 * Reference counter type used for files. The counter is an atomic64_t on
 * CONFIG_64BIT and an atomic_t otherwise; the helpers in this header access
 * it through the atomic_long_*() operations.
 */
typedef struct {
#ifdef CONFIG_64BIT
atomic64_t refcnt;
#else
atomic_t refcnt;
#endif
} file_ref_t;
/**
 * file_ref_init - Set up a file reference count
 * @ref: Pointer to the reference count
 * @cnt: Number of references to start with, typically '1'
 *
 * The counter stores the number of references minus one, so an initial
 * count of '1' is stored as FILE_REF_ONEREF.
 */
static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
{
	unsigned long stored = cnt - 1;

	atomic_long_set(&ref->refcnt, stored);
}
/*
 * Declared here, defined elsewhere. file_ref_put() calls this with the
 * decremented counter value when that value is negative.
 */
bool __file_ref_put(file_ref_t *ref, unsigned long cnt);
/**
 * file_ref_get - Try to acquire one reference on a file
 * @ref: Pointer to the reference count
 *
 * Behaves like atomic_inc_not_zero() except that the count saturates at
 * FILE_REF_MAXREF.
 *
 * Provides full memory ordering.
 *
 * Return: True if a reference was successfully acquired. False if the
 *         attempt failed, which happens when the last reference has
 *         already been put.
 */
static __always_inline __must_check bool file_ref_get(file_ref_t *ref)
{
	/*
	 * The increment is unconditional and fully ordered. The saturation
	 * and dead zones provide enough tolerance for this.
	 *
	 * A negative result means the file in question may already have
	 * been freed and, due to SLAB_TYPESAFE_BY_RCU, immediately reused.
	 * Hence, unconditionally altering the reference count again, e.g.,
	 * to reset it back to the middle of the dead zone, risks marking
	 * someone else's file as dead behind their back.
	 *
	 * It would be possible to do a careful:
	 *
	 *	cnt = atomic_long_inc_return();
	 *	if (likely(cnt >= 0))
	 *		return true;
	 *
	 * and then something like:
	 *
	 *	if (cnt >= FILE_REF_RELEASED)
	 *		atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD);
	 *
	 * to set the value back to the middle of the dead zone. But it is
	 * practically impossible to go from FILE_REF_DEAD to
	 * FILE_REF_ONEREF: it would take 2305843009213693952 (2^61)
	 * file_ref_get() calls to resurrect such a dead file.
	 */
	if (atomic_long_add_negative(1, &ref->refcnt))
		return false;

	return true;
}
/**
 * file_ref_inc - Take an additional reference on a file
 * @ref: Pointer to the reference count
 *
 * For callers that already hold a reference. Emits a one-time warning if
 * the counter value seen before the increment was negative, i.e. the
 * caller did not actually hold a reference.
 */
static __always_inline void file_ref_inc(file_ref_t *ref)
{
	long before;

	before = atomic_long_fetch_inc_relaxed(&ref->refcnt);
	WARN_ONCE(before < 0, "file_ref_inc() on a released file reference");
}
/**
 * file_ref_put -- Drop one file reference
 * @ref: Pointer to the reference count
 *
 * Provides release memory ordering, such that prior loads and stores
 * are done before, and provides an acquire ordering on success such
 * that free() must come after.
 *
 * Return: True if this was the last reference and no future references
 *         are possible; the caller may then safely release the object
 *         protected by the reference counter.
 *         False if active references remain or the put() raced with a
 *         concurrent get()/put() pair; the caller must not release the
 *         protected object.
 */
static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
{
	long remaining;

	/*
	 * Files are SLAB_TYPESAFE_BY_RCU, so a recycled file does not turn
	 * file_ref_put() into a UAF. Freeing the whole slab page once it
	 * becomes unused still could, so keep preemption disabled for the
	 * duration of this function to protect against that.
	 */
	guard(preempt)();

	/*
	 * Decrement unconditionally; the saturation and dead zones provide
	 * enough tolerance for this. A negative result means that either
	 * the last reference was dropped or the counter sits inside the
	 * saturation or dead zone, both of which the slow path handles.
	 */
	remaining = atomic_long_dec_return(&ref->refcnt);
	if (remaining < 0)
		return __file_ref_put(ref, remaining);

	return false;
}
/**
 * file_ref_read - Report how many file references are held
 * @ref: Pointer to the reference count
 *
 * The counter stores the number of references minus one, so the raw value
 * is adjusted by one before it is returned.
 *
 * Return: The number of held references (0 ... N)
 */
static inline unsigned long file_ref_read(file_ref_t *ref)
{
	unsigned long raw = atomic_long_read(&ref->refcnt);

	/* Values from FILE_REF_RELEASED upwards lie in the dead zone. */
	if (raw >= FILE_REF_RELEASED)
		return 0;

	return raw + 1;
}
#endif

View File

@ -45,6 +45,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/maple_tree.h> #include <linux/maple_tree.h>
#include <linux/rw_hint.h> #include <linux/rw_hint.h>
#include <linux/file_ref.h>
#include <asm/byteorder.h> #include <asm/byteorder.h>
#include <uapi/linux/fs.h> #include <uapi/linux/fs.h>
@ -1006,7 +1007,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
/** /**
* struct file - Represents a file * struct file - Represents a file
* @f_count: reference count * @f_ref: reference count
* @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
* @f_mode: FMODE_* flags often used in hotpaths * @f_mode: FMODE_* flags often used in hotpaths
* @f_op: file operations * @f_op: file operations
@ -1031,7 +1032,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
* @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
*/ */
struct file { struct file {
atomic_long_t f_count; file_ref_t f_ref;
spinlock_t f_lock; spinlock_t f_lock;
fmode_t f_mode; fmode_t f_mode;
const struct file_operations *f_op; const struct file_operations *f_op;
@ -1079,15 +1080,14 @@ struct file_handle {
static inline struct file *get_file(struct file *f) static inline struct file *get_file(struct file *f)
{ {
long prior = atomic_long_fetch_inc_relaxed(&f->f_count); file_ref_inc(&f->f_ref);
WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
return f; return f;
} }
struct file *get_file_rcu(struct file __rcu **f); struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f); struct file *get_file_active(struct file **f);
#define file_count(x) atomic_long_read(&(x)->f_count) #define file_count(f) file_ref_read(&(f)->f_ref)
#define MAX_NON_LFS ((1UL<<31) - 1) #define MAX_NON_LFS ((1UL<<31) - 1)

View File

@ -51,7 +51,6 @@
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/mman.h> #include <linux/mman.h>
#include <linux/percpu.h> #include <linux/percpu.h>

View File

@ -16,7 +16,6 @@
#include <uapi/linux/btf.h> #include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h> #include <linux/bpf_lsm.h>
#include <linux/btf_ids.h> #include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h> #include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache); DEFINE_BPF_STORAGE_CACHE(inode_cache);

View File

@ -16,7 +16,6 @@
#include <linux/filter.h> #include <linux/filter.h>
#include <uapi/linux/btf.h> #include <uapi/linux/btf.h>
#include <linux/btf_ids.h> #include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h> #include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache); DEFINE_BPF_STORAGE_CACHE(task_cache);

View File

@ -5,7 +5,6 @@
#include <linux/namei.h> #include <linux/namei.h>
#include <linux/pid_namespace.h> #include <linux/pid_namespace.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/fdtable.h>
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/bpf_mem_alloc.h> #include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h> #include <linux/btf_ids.h>
@ -286,17 +285,14 @@ again:
curr_fd = 0; curr_fd = 0;
} }
rcu_read_lock(); f = fget_task_next(curr_task, &curr_fd);
f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
if (f) { if (f) {
/* set info->fd */ /* set info->fd */
info->fd = curr_fd; info->fd = curr_fd;
rcu_read_unlock();
return f; return f;
} }
/* the current task is done, go to the next task */ /* the current task is done, go to the next task */
rcu_read_unlock();
put_task_struct(curr_task); put_task_struct(curr_task);
if (info->common.type == BPF_TASK_ITER_TID) { if (info->common.type == BPF_TASK_ITER_TID) {

View File

@ -1,6 +1,5 @@
#include <linux/bpf.h> #include <linux/bpf.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/fdtable.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/kernel.h> #include <linux/kernel.h>

View File

@ -25,7 +25,6 @@
#include <linux/acct.h> #include <linux/acct.h>
#include <linux/tsacct_kern.h> #include <linux/tsacct_kern.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h> #include <linux/freezer.h>
#include <linux/binfmts.h> #include <linux/binfmts.h>
#include <linux/nsproxy.h> #include <linux/nsproxy.h>

View File

@ -63,9 +63,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{ {
struct file *file; struct file *file;
rcu_read_lock(); file = fget_task(task, idx);
file = task_lookup_fdget_rcu(task, idx);
rcu_read_unlock();
if (file) if (file)
fput(file); fput(file);

View File

@ -18,7 +18,6 @@
#include <linux/completion.h> #include <linux/completion.h>
#include <linux/cred.h> #include <linux/cred.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/mount.h> #include <linux/mount.h>

View File

@ -15,7 +15,6 @@
#include <linux/completion.h> #include <linux/completion.h>
#include <linux/cred.h> #include <linux/cred.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/mount.h> #include <linux/mount.h>

View File

@ -13,7 +13,6 @@
#include <linux/completion.h> #include <linux/completion.h>
#include <linux/cred.h> #include <linux/cred.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs_struct.h> #include <linux/fs_struct.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/security.h> #include <linux/security.h>

View File

@ -13,7 +13,6 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/skbuff.h> #include <linux/skbuff.h>
#include <linux/inet.h> #include <linux/inet.h>
#include <linux/fdtable.h>
#include <linux/rhashtable.h> #include <linux/rhashtable.h>
#include <net/sock.h> #include <net/sock.h>

View File

@ -9,7 +9,6 @@
*/ */
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/fdtable.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/mount.h> #include <linux/mount.h>

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
dnotify_test dnotify_test
devpts_pts devpts_pts
file_stressor

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
CFLAGS += $(KHDR_INCLUDES) CFLAGS += $(KHDR_INCLUDES)
TEST_GEN_PROGS := devpts_pts TEST_GEN_PROGS := devpts_pts file_stressor
TEST_GEN_PROGS_EXTENDED := dnotify_test TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk include ../lib.mk

View File

@ -0,0 +1,194 @@
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#define __SANE_USERSPACE_TYPES__
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <unistd.h>
#include "../kselftest_harness.h"
#include <linux/types.h>
#include <linux/mount.h>
#include <sys/syscall.h>
/* Invoke the fsopen system call directly through syscall(). */
static inline int sys_fsopen(const char *fsname, unsigned int flags)
{
	long ret = syscall(__NR_fsopen, fsname, flags);

	return ret;
}
/* Invoke the fsconfig system call directly through syscall(). */
static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key,
			       const char *value, int aux)
{
	long ret = syscall(__NR_fsconfig, fd, cmd, key, value, aux);

	return ret;
}
/* Invoke the fsmount system call directly through syscall(). */
static inline int sys_fsmount(int fd, unsigned int flags,
			      unsigned int attr_flags)
{
	long ret = syscall(__NR_fsmount, fd, flags, attr_flags);

	return ret;
}
#ifndef MOVE_MOUNT_F_EMPTY_PATH
#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
#endif
/* Invoke the move_mount system call directly through syscall(). */
static inline int sys_move_mount(int from_dfd, const char *from_pathname,
				 int to_dfd, const char *to_pathname,
				 unsigned int flags)
{
	long ret = syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
			   to_pathname, flags);

	return ret;
}
/* Per-test state shared by the setup, the test body and the teardown. */
FIXTURE(file_stressor) {
int fd_tmpfs; /* fd returned by sys_fsmount() for the tmpfs instance */
int nr_procs; /* number of online CPUs; one child of each kind per CPU */
int max_fds; /* number of files an opener child opens per iteration */
pid_t *pids_openers; /* pids of the children that open and close files */
pid_t *pids_getdents; /* pids of the children that read /proc/<pid>/fd */
int *fd_proc_pid; /* one open /proc/<pid>/fd directory per opener child */
};
/*
 * Per-test setup: switch to a new mount namespace, mount a new tmpfs
 * instance on /slab_typesafe_by_rcu and allocate the arrays that the test
 * body fills in (one slot per online CPU).
 */
FIXTURE_SETUP(file_stressor)
{
int fd_context;
/* New mount namespace; mark every mount below "/" as a slave mount. */
ASSERT_EQ(unshare(CLONE_NEWNS), 0);
ASSERT_EQ(mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
ASSERT_EQ(mkdir("/slab_typesafe_by_rcu", 0755), 0);
/* Create a tmpfs instance through fsopen()/fsconfig()/fsmount(). */
fd_context = sys_fsopen("tmpfs", 0);
ASSERT_GE(fd_context, 0);
ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
self->fd_tmpfs = sys_fsmount(fd_context, 0, 0);
ASSERT_GE(self->fd_tmpfs, 0);
ASSERT_EQ(close(fd_context), 0);
/* Attach the detached mount at the directory created above. */
ASSERT_EQ(sys_move_mount(self->fd_tmpfs, "", -EBADF, "/slab_typesafe_by_rcu", MOVE_MOUNT_F_EMPTY_PATH), 0);
/* Size all bookkeeping arrays by the number of online CPUs. */
self->nr_procs = sysconf(_SC_NPROCESSORS_ONLN);
self->pids_openers = malloc(sizeof(pid_t) * self->nr_procs);
ASSERT_NE(self->pids_openers, NULL);
self->pids_getdents = malloc(sizeof(pid_t) * self->nr_procs);
ASSERT_NE(self->pids_getdents, NULL);
self->fd_proc_pid = malloc(sizeof(int) * self->nr_procs);
ASSERT_NE(self->fd_proc_pid, NULL);
/* Number of files every opener child opens per iteration. */
self->max_fds = 500;
}
/*
 * Per-test teardown: reap every child forked by the test body, release the
 * arrays allocated during setup and remove the tmpfs mount and its
 * mountpoint directory.
 */
FIXTURE_TEARDOWN(file_stressor)
{
	for (int i = 0; i < self->nr_procs; i++) {
		int wstatus;
		pid_t pid;

		pid = waitpid(self->pids_openers[i], &wstatus, 0);
		ASSERT_EQ(pid, self->pids_openers[i]);
		/*
		 * The child must have terminated, either by exiting or by
		 * being killed. (The previous "!exited || !signaled" form
		 * was true for every possible status.)
		 */
		ASSERT_TRUE(WIFEXITED(wstatus) || WIFSIGNALED(wstatus));

		pid = waitpid(self->pids_getdents[i], &wstatus, 0);
		ASSERT_EQ(pid, self->pids_getdents[i]);
		ASSERT_TRUE(WIFEXITED(wstatus) || WIFSIGNALED(wstatus));
	}

	free(self->pids_openers);
	free(self->pids_getdents);
	/*
	 * Release the array allocated in setup as well. Only the array is
	 * freed: its entries are filled in by the test body and may be
	 * unset if the body stopped early, so they are not closed here.
	 */
	free(self->fd_proc_pid);

	ASSERT_EQ(close(self->fd_tmpfs), 0);

	/* Best effort: the result of the unmount is deliberately ignored. */
	umount2("/slab_typesafe_by_rcu", 0);
	ASSERT_EQ(rmdir("/slab_typesafe_by_rcu"), 0);
}
/*
 * Stress the recycling of SLAB_TYPESAFE_BY_RCU files.
 *
 * One "opener" child per online CPU keeps creating and closing files on the
 * tmpfs mount, while one "getdents" child per online CPU keeps reading the
 * matching opener's /proc/<pid>/fd directory. The parent lets them run for
 * 15 minutes and then kills all children; teardown reaps them.
 */
TEST_F_TIMEOUT(file_stressor, slab_typesafe_by_rcu, 900 * 2)
{
	for (int i = 0; i < self->nr_procs; i++) {
		pid_t pid_self;

		self->pids_openers[i] = fork();
		ASSERT_GE(self->pids_openers[i], 0);

		if (self->pids_openers[i] != 0)
			continue;

		/* Child: open max_fds files, close them all, repeat forever. */
		pid_self = getpid();
		self->pids_openers[i] = pid_self;

		for (;;) {
			for (int j = 0; j < self->max_fds; j++) {
				char path[PATH_MAX];

				/*
				 * Name the file after this child's own pid.
				 * The file index must not reuse 'i': that
				 * would shadow the outer loop variable and
				 * index pids_openers[] out of bounds.
				 */
				snprintf(path, sizeof(path),
					 "/slab_typesafe_by_rcu/file-%d-%d",
					 pid_self, j);

				/*
				 * Failures are ignored on purpose and the
				 * descriptors are left open; close_range()
				 * below releases them in bulk.
				 */
				(void)open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0644);
			}

			close_range(3, ~0U, 0);
		}

		exit(0);
	}

	/* Parent: open every opener's /proc/<pid>/fd directory. */
	for (int i = 0; i < self->nr_procs; i++) {
		char path[PATH_MAX];

		snprintf(path, sizeof(path), "/proc/%d/fd/", self->pids_openers[i]);
		self->fd_proc_pid[i] = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
		ASSERT_GE(self->fd_proc_pid[i], 0);
	}

	for (int i = 0; i < self->nr_procs; i++) {
		self->pids_getdents[i] = fork();
		ASSERT_GE(self->pids_getdents[i], 0);

		if (self->pids_getdents[i] != 0)
			continue;

		self->pids_getdents[i] = getpid();

		for (;;) {
			char ents[1024];
			ssize_t nr_read;

			/*
			 * Concurrently read /proc/<pid>/fd/ which roughly does:
			 *
			 *	f = fget_task_next(p, &fd);
			 *	if (!f)
			 *		break;
			 *	data.mode = f->f_mode;
			 *	fput(f);
			 *
			 * Which means that it'll try to get a reference to a
			 * file in another task's file descriptor table.
			 *
			 * Under heavy file load it is increasingly likely that
			 * the other task will manage to close @file and @file
			 * is being recycled due to SLAB_TYPESAFE_BY_RCU
			 * concurrently. This will trigger various warnings in
			 * the file reference counting code.
			 */
			do {
				nr_read = syscall(SYS_getdents64, self->fd_proc_pid[i], ents, sizeof(ents));
			} while (nr_read > 0);

			/*
			 * getdents64() returns 0 at the end of the directory,
			 * so stop on 0 as well as on errors and rewind;
			 * otherwise the loop above would spin at the end of
			 * the directory and never re-read it.
			 */
			lseek(self->fd_proc_pid[i], 0, SEEK_SET);
		}

		exit(0);
	}

	/* Let the children race for 15 minutes, then stop all of them. */
	ASSERT_EQ(clock_nanosleep(CLOCK_MONOTONIC, 0, &(struct timespec){ .tv_sec = 900 /* 15 min */ }, NULL), 0);

	for (int i = 0; i < self->nr_procs; i++) {
		kill(self->pids_openers[i], SIGKILL);
		kill(self->pids_getdents[i], SIGKILL);
	}
}
TEST_HARNESS_MAIN