2019-05-19 12:08:55 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* linux/fs/file_table.c
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
|
|
* Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/file.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/fs.h>
|
2022-11-20 14:15:34 +00:00
|
|
|
#include <linux/filelock.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <linux/security.h>
|
2017-02-02 16:54:15 +00:00
|
|
|
#include <linux/cred.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <linux/eventpoll.h>
|
2005-09-09 20:04:13 +00:00
|
|
|
#include <linux/rcupdate.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <linux/mount.h>
|
2006-01-11 20:17:46 +00:00
|
|
|
#include <linux/capability.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
#include <linux/cdev.h>
|
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-12 21:06:03 +00:00
|
|
|
#include <linux/fsnotify.h>
|
2006-03-08 05:55:35 +00:00
|
|
|
#include <linux/sysctl.h>
|
|
|
|
#include <linux/percpu_counter.h>
|
fs: scale files_lock
fs: scale files_lock
Improve scalability of files_lock by adding per-cpu, per-sb files lists,
protected with an lglock. The lglock provides fast access to the per-cpu lists
to add and remove files. It also provides a snapshot of all the per-cpu lists
(although this is very slow).
One difficulty with this approach is that a file can be removed from the list
by another CPU. We must track which per-cpu list the file is on with a new
variale in the file struct (packed into a hole on 64-bit archs). Scalability
could suffer if files are frequently removed from different cpu's list.
However loads with frequent removal of files imply short interval between
adding and removing the files, and the scheduler attempts to avoid moving
processes too far away. Also, even in the case of cross-CPU removal, the
hardware has much more opportunity to parallelise cacheline transfers with N
cachelines than with 1.
A worst-case test of 1 CPU allocating files subsequently being freed by N CPUs
degenerates to contending on a single lock, which is no worse than before. When
more than one CPU are allocating files, even if they are always freed by
different CPUs, there will be more parallelism than the single-lock case.
Testing results:
On a 2 socket, 8 core opteron, I measure the number of times the lock is taken
to remove the file, the number of times it is removed by the same CPU that
added it, and the number of times it is removed by the same node that added it.
Booting: locks= 25049 cpu-hits= 23174 (92.5%) node-hits= 23945 (95.6%)
kbuild -j16 locks=2281913 cpu-hits=2208126 (96.8%) node-hits=2252674 (98.7%)
dbench 64 locks=4306582 cpu-hits=4287247 (99.6%) node-hits=4299527 (99.8%)
So a file is removed from the same CPU it was added by over 90% of the time.
It remains within the same node 95% of the time.
Tim Chen ran some numbers for a 64 thread Nehalem system performing a compile.
throughput
2.6.34-rc2 24.5
+patch 24.9
us sys idle IO wait (in %)
2.6.34-rc2 51.25 28.25 17.25 3.25
+patch 53.75 18.5 19 8.75
So significantly less CPU time spent in kernel code, higher idle time and
slightly higher throughput.
Single threaded performance difference was within the noise of microbenchmarks.
That is not to say penalty does not exist, the code is larger and more memory
accesses required so it will be slightly slower.
Cc: linux-kernel@vger.kernel.org
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2010-08-17 18:37:38 +00:00
|
|
|
#include <linux/percpu.h>
|
2012-06-24 05:56:45 +00:00
|
|
|
#include <linux/task_work.h>
|
2015-08-06 22:46:20 +00:00
|
|
|
#include <linux/swap.h>
|
2022-02-15 02:08:28 +00:00
|
|
|
#include <linux/kmemleak.h>
|
2006-03-08 05:55:35 +00:00
|
|
|
|
2011-07-26 23:09:06 +00:00
|
|
|
#include <linux/atomic.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2009-12-04 20:47:36 +00:00
|
|
|
#include "internal.h"
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/* sysctl tunables... */
|
2022-01-22 06:12:56 +00:00
|
|
|
static struct files_stat_struct files_stat = {
|
2005-04-16 22:20:36 +00:00
|
|
|
.max_files = NR_FILE
|
|
|
|
};
|
|
|
|
|
2008-12-10 17:35:45 +00:00
|
|
|
/* SLAB cache for file structures */
|
2023-10-11 16:55:00 +00:00
|
|
|
static struct kmem_cache *filp_cachep __ro_after_init;
|
2024-10-07 14:23:57 +00:00
|
|
|
static struct kmem_cache *bfilp_cachep __ro_after_init;
|
2008-12-10 17:35:45 +00:00
|
|
|
|
2006-03-08 05:55:35 +00:00
|
|
|
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2023-10-09 15:37:12 +00:00
|
|
|
/* Container for backing file with optional user path */
|
2023-06-15 11:22:28 +00:00
|
|
|
struct backing_file {
|
|
|
|
struct file file;
|
2024-10-07 14:23:57 +00:00
|
|
|
union {
|
|
|
|
struct path user_path;
|
|
|
|
freeptr_t bf_freeptr;
|
|
|
|
};
|
2023-06-15 11:22:28 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static inline struct backing_file *backing_file(struct file *f)
|
|
|
|
{
|
|
|
|
return container_of(f, struct backing_file, file);
|
|
|
|
}
|
|
|
|
|
2023-10-09 15:37:12 +00:00
|
|
|
struct path *backing_file_user_path(struct file *f)
|
2023-06-15 11:22:28 +00:00
|
|
|
{
|
2023-10-09 15:37:12 +00:00
|
|
|
return &backing_file(f)->user_path;
|
2023-06-15 11:22:28 +00:00
|
|
|
}
|
2023-10-09 15:37:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(backing_file_user_path);
|
2023-06-15 11:22:28 +00:00
|
|
|
|
2006-03-08 05:55:35 +00:00
|
|
|
static inline void file_free(struct file *f)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2018-07-09 15:24:21 +00:00
|
|
|
security_file_free(f);
|
2023-06-15 11:22:28 +00:00
|
|
|
if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
|
2018-07-18 13:44:40 +00:00
|
|
|
percpu_counter_dec(&nr_files);
|
2023-09-29 06:45:59 +00:00
|
|
|
put_cred(f->f_cred);
|
|
|
|
if (unlikely(f->f_mode & FMODE_BACKING)) {
|
2023-10-09 15:37:12 +00:00
|
|
|
path_put(backing_file_user_path(f));
|
2024-10-07 14:23:57 +00:00
|
|
|
kmem_cache_free(bfilp_cachep, backing_file(f));
|
2023-09-29 06:45:59 +00:00
|
|
|
} else {
|
|
|
|
kmem_cache_free(filp_cachep, f);
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2006-03-08 05:55:35 +00:00
|
|
|
/*
|
|
|
|
* Return the total number of open files in the system
|
|
|
|
*/
|
2010-10-26 21:22:44 +00:00
|
|
|
static long get_nr_files(void)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2006-03-08 05:55:35 +00:00
|
|
|
return percpu_counter_read_positive(&nr_files);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2006-03-08 05:55:35 +00:00
|
|
|
/*
|
|
|
|
* Return the maximum number of open files in the system
|
|
|
|
*/
|
2010-10-26 21:22:44 +00:00
|
|
|
unsigned long get_max_files(void)
|
2005-09-09 20:04:13 +00:00
|
|
|
{
|
2006-03-08 05:55:35 +00:00
|
|
|
return files_stat.max_files;
|
2005-09-09 20:04:13 +00:00
|
|
|
}
|
2006-03-08 05:55:35 +00:00
|
|
|
EXPORT_SYMBOL_GPL(get_max_files);
|
|
|
|
|
2022-01-22 06:12:56 +00:00
|
|
|
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
|
|
|
|
|
2006-03-08 05:55:35 +00:00
|
|
|
/*
|
|
|
|
* Handle nr_files sysctl
|
|
|
|
*/
|
sysctl: treewide: constify the ctl_table argument of proc_handlers
const qualify the struct ctl_table argument in the proc_handler function
signatures. This is a prerequisite to moving the static ctl_table
structs into .rodata data which will ensure that proc_handler function
pointers cannot be modified.
This patch has been generated by the following coccinelle script:
```
virtual patch
@r1@
identifier ctl, write, buffer, lenp, ppos;
identifier func !~ "appldata_(timer|interval)_handler|sched_(rt|rr)_handler|rds_tcp_skbuf_handler|proc_sctp_do_(hmac_alg|rto_min|rto_max|udp_port|alpha_beta|auth|probe_interval)";
@@
int func(
- struct ctl_table *ctl
+ const struct ctl_table *ctl
,int write, void *buffer, size_t *lenp, loff_t *ppos);
@r2@
identifier func, ctl, write, buffer, lenp, ppos;
@@
int func(
- struct ctl_table *ctl
+ const struct ctl_table *ctl
,int write, void *buffer, size_t *lenp, loff_t *ppos)
{ ... }
@r3@
identifier func;
@@
int func(
- struct ctl_table *
+ const struct ctl_table *
,int , void *, size_t *, loff_t *);
@r4@
identifier func, ctl;
@@
int func(
- struct ctl_table *ctl
+ const struct ctl_table *ctl
,int , void *, size_t *, loff_t *);
@r5@
identifier func, write, buffer, lenp, ppos;
@@
int func(
- struct ctl_table *
+ const struct ctl_table *
,int write, void *buffer, size_t *lenp, loff_t *ppos);
```
* Code formatting was adjusted in xfs_sysctl.c to comply with code
conventions. The xfs_stats_clear_proc_handler,
xfs_panic_mask_proc_handler and xfs_deprecated_dointvec_minmax where
adjusted.
* The ctl_table argument in proc_watchdog_common was const qualified.
This is called from a proc_handler itself and is calling back into
another proc_handler, making it necessary to change it as part of the
proc_handler migration.
Co-developed-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Co-developed-by: Joel Granados <j.granados@samsung.com>
Signed-off-by: Joel Granados <j.granados@samsung.com>
2024-07-24 18:59:29 +00:00
|
|
|
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
|
2022-01-22 06:12:56 +00:00
|
|
|
size_t *lenp, loff_t *ppos)
|
2006-03-08 05:55:35 +00:00
|
|
|
{
|
|
|
|
files_stat.nr_files = get_nr_files();
|
2010-10-26 21:22:44 +00:00
|
|
|
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
2006-03-08 05:55:35 +00:00
|
|
|
}
|
2022-01-22 06:12:56 +00:00
|
|
|
|
|
|
|
static struct ctl_table fs_stat_sysctls[] = {
|
|
|
|
{
|
|
|
|
.procname = "file-nr",
|
|
|
|
.data = &files_stat,
|
|
|
|
.maxlen = sizeof(files_stat),
|
|
|
|
.mode = 0444,
|
|
|
|
.proc_handler = proc_nr_files,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "file-max",
|
|
|
|
.data = &files_stat.max_files,
|
|
|
|
.maxlen = sizeof(files_stat.max_files),
|
|
|
|
.mode = 0644,
|
|
|
|
.proc_handler = proc_doulongvec_minmax,
|
|
|
|
.extra1 = SYSCTL_LONG_ZERO,
|
|
|
|
.extra2 = SYSCTL_LONG_MAX,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "nr_open",
|
|
|
|
.data = &sysctl_nr_open,
|
|
|
|
.maxlen = sizeof(unsigned int),
|
|
|
|
.mode = 0644,
|
|
|
|
.proc_handler = proc_dointvec_minmax,
|
|
|
|
.extra1 = &sysctl_nr_open_min,
|
|
|
|
.extra2 = &sysctl_nr_open_max,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static int __init init_fs_stat_sysctls(void)
|
2006-03-08 05:55:35 +00:00
|
|
|
{
|
2022-01-22 06:12:56 +00:00
|
|
|
register_sysctl_init("fs", fs_stat_sysctls);
|
2022-02-15 02:08:28 +00:00
|
|
|
if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
|
|
|
|
struct ctl_table_header *hdr;
|
2024-07-27 07:21:34 +00:00
|
|
|
|
2022-02-15 02:08:28 +00:00
|
|
|
hdr = register_sysctl_mount_point("fs/binfmt_misc");
|
|
|
|
kmemleak_not_leak(hdr);
|
|
|
|
}
|
2022-01-22 06:12:56 +00:00
|
|
|
return 0;
|
2006-03-08 05:55:35 +00:00
|
|
|
}
|
2022-01-22 06:12:56 +00:00
|
|
|
fs_initcall(init_fs_stat_sysctls);
|
2006-03-08 05:55:35 +00:00
|
|
|
#endif
|
2005-09-09 20:04:13 +00:00
|
|
|
|
2023-06-15 11:22:27 +00:00
|
|
|
static int init_file(struct file *f, int flags, const struct cred *cred)
|
2018-07-18 13:44:40 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
f->f_cred = get_cred(cred);
|
|
|
|
error = security_file_alloc(f);
|
|
|
|
if (unlikely(error)) {
|
2023-07-01 17:11:34 +00:00
|
|
|
put_cred(f->f_cred);
|
2023-06-15 11:22:27 +00:00
|
|
|
return error;
|
2018-07-18 13:44:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock_init(&f->f_lock);
|
2024-08-30 13:04:59 +00:00
|
|
|
/*
|
|
|
|
* Note that f_pos_lock is only used for files raising
|
|
|
|
* FMODE_ATOMIC_POS and directories. Other files such as pipes
|
|
|
|
* don't need it and since f_pos_lock is in a union may reuse
|
|
|
|
* the space for other purposes. They are expected to initialize
|
|
|
|
* the respective member when opening the file.
|
|
|
|
*/
|
2018-07-18 13:44:40 +00:00
|
|
|
mutex_init(&f->f_pos_lock);
|
fs: port files to file_ref
Port files to rely on file_ref reference to improve scaling and gain
overflow protection.
- We continue to WARN during get_file() in case a file that is already
marked dead is revived as get_file() is only valid if the caller
already holds a reference to the file. This hasn't changed just the
check changes.
- The semantics for epoll and ttm's dmabuf usage have changed. Both
epoll and ttm synchronize with __fput() to prevent the underlying file
from beeing freed.
(1) epoll
Explaining epoll is straightforward using a simple diagram.
Essentially, the mutex of the epoll instance needs to be taken in both
__fput() and around epi_fget() preventing the file from being freed
while it is polled or preventing the file from being resurrected.
CPU1 CPU2
fput(file)
-> __fput(file)
-> eventpoll_release(file)
-> eventpoll_release_file(file)
mutex_lock(&ep->mtx)
epi_item_poll()
-> epi_fget()
-> file_ref_get(file)
mutex_unlock(&ep->mtx)
mutex_lock(&ep->mtx);
__ep_remove()
mutex_unlock(&ep->mtx);
-> kmem_cache_free(file)
(2) ttm dmabuf
This explanation is a bit more involved. A regular dmabuf file stashed
the dmabuf in file->private_data and the file in dmabuf->file:
file->private_data = dmabuf;
dmabuf->file = file;
The generic release method of a dmabuf file handles file specific
things:
f_op->release::dma_buf_file_release()
while the generic dentry release method of a dmabuf handles dmabuf
freeing including driver specific things:
dentry->d_release::dma_buf_release()
During ttm dmabuf initialization in ttm_object_device_init() the ttm
driver copies the provided struct dma_buf_ops into a private location:
struct ttm_object_device {
spinlock_t object_lock;
struct dma_buf_ops ops;
void (*dmabuf_release)(struct dma_buf *dma_buf);
struct idr idr;
};
ttm_object_device_init(const struct dma_buf_ops *ops)
{
// copy original dma_buf_ops in private location
tdev->ops = *ops;
// stash the release method of the original struct dma_buf_ops
tdev->dmabuf_release = tdev->ops.release;
// override the release method in the copy of the struct dma_buf_ops
// with ttm's own dmabuf release method
tdev->ops.release = ttm_prime_dmabuf_release;
}
When a new dmabuf is created the struct dma_buf_ops with the overriden
release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &tdev->ops;
exp_info.size = prime->size;
exp_info.flags = flags;
exp_info.priv = prime;
The call to dma_buf_export() then sets
mutex_lock_interruptible(&prime->mutex);
dma_buf = dma_buf_export(&exp_info)
{
dmabuf->ops = exp_info->ops;
}
mutex_unlock(&prime->mutex);
which creates a new dmabuf file and then install a file descriptor to
it in the callers file descriptor table:
ret = dma_buf_fd(dma_buf, flags);
When that dmabuf file is closed we now get:
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
Where we can see that prime->dma_buf is set to NULL. So when we have
the following diagram:
CPU1 CPU2
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
ttm_prime_handle_to_fd()
mutex_lock_interruptible(&prime->mutex)
dma_buf = prime->dma_buf
dma_buf && get_dma_buf_unless_doomed(dma_buf)
-> file_ref_get(dma_buf->file)
mutex_unlock(&prime->mutex);
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
-> kmem_cache_free(file)
The logic of the mechanism is the same as for epoll: sync with
__fput() preventing the file from being freed. Here the
synchronization happens through the ttm instance's prime->mutex.
Basically, the lifetime of the dma_buf and the file are tighly
coupled.
Both (1) and (2) used to call atomic_inc_not_zero() to check whether
the file has already been marked dead and then refuse to revive it.
This is only safe because both (1) and (2) sync with __fput() and thus
prevent kmem_cache_free() on the file being called and thus prevent
the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.
Both (1) and (2) have been ported from atomic_inc_not_zero() to
file_ref_get(). That means a file that is already in the process of
being marked as FILE_REF_DEAD:
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
can be revived again:
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
This is fine and inherent to the file_ref_get()/file_ref_put()
semantics. For both (1) and (2) this is safe because __fput() is
prevented from making progress if file_ref_get() fails due to the
aforementioned synchronization mechanisms.
Two cases need to be considered that affect both (1) epoll and (2) ttm
dmabuf:
(i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but
before that fput() can mark the file as FILE_REF_DEAD someone
manages to sneak in a file_ref_get() and brings the refcount back
from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
fput() doesn't call __fput(). For epoll the poll will finish and
for ttm dmabuf the file can be used again. For ttm dambuf this is
actually an advantage because it avoids immediately allocating
a new dmabuf object.
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
(ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
also suceeds in actually marking it FILE_REF_DEAD and then calls
into __fput() to free the file.
When either (1) or (2) call file_ref_get() they fail as
atomic_long_add_negative() will return true.
At the same time, both (1) and (2) all file_ref_get() under
mutexes that __fput() must also acquire preventing
kmem_cache_free() from freeing the file.
So while this might be treated as a change in semantics for (1) and
(2) it really isn't. It if should end up causing issues this can be
fixed by adding a helper that does something like:
long cnt = atomic_long_read(&ref->refcnt);
do {
if (cnt < 0)
return false;
} while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
return true;
which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.
- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
anymore once files have been ported to file_ref_t.
The kmem_cache_zalloc() call will memset() the whole struct file to
zero when it is reallocated. This will also set file->f_ref to zero
which mens that a concurrent file_ref_get() can return true:
CPU1 CPU2
__get_file_rcu()
rcu_dereference_raw()
close()
[frees file]
alloc_empty_file()
kmem_cache_zalloc()
[reallocates same file]
memset(..., 0, ...)
file_ref_get()
[increments 0->1, returns true]
init_file()
file_ref_init(..., 1)
[sets to 0]
rcu_dereference_raw()
fput()
file_ref_put()
[decrements 0->FILE_REF_NOREF, frees file]
[UAF]
causing a concurrent __get_file_rcu() call to acquire a reference to
the file that is about to be reallocated and immediately freeing it
on realizing that it has been recycled. This causes a UAF for the
task that reallocated/recycled the file.
This is prevented by switching from kmem_cache_zalloc() to
kmem_cache_alloc() and initializing the fields manually. With
file->f_ref initialized last.
Note that a memset() also isn't guaranteed to atomically update an
unsigned long so it's theoretically possible to see torn and
therefore bogus counter values.
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-10-07 14:23:59 +00:00
|
|
|
memset(&f->f_path, 0, sizeof(f->f_path));
|
|
|
|
memset(&f->f_ra, 0, sizeof(f->f_ra));
|
|
|
|
|
|
|
|
f->f_flags = flags;
|
|
|
|
f->f_mode = OPEN_FMODE(flags);
|
|
|
|
|
|
|
|
f->f_op = NULL;
|
|
|
|
f->f_mapping = NULL;
|
|
|
|
f->private_data = NULL;
|
|
|
|
f->f_inode = NULL;
|
|
|
|
f->f_owner = NULL;
|
|
|
|
#ifdef CONFIG_EPOLL
|
|
|
|
f->f_ep = NULL;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
f->f_iocb_flags = 0;
|
|
|
|
f->f_pos = 0;
|
|
|
|
f->f_wb_err = 0;
|
|
|
|
f->f_sb_err = 0;
|
2018-07-18 13:44:40 +00:00
|
|
|
|
2023-09-29 06:45:59 +00:00
|
|
|
/*
|
|
|
|
* We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
|
|
|
|
* fget-rcu pattern users need to be able to handle spurious
|
|
|
|
* refcount bumps we should reinitialize the reused file first.
|
|
|
|
*/
|
fs: port files to file_ref
Port files to rely on file_ref reference to improve scaling and gain
overflow protection.
- We continue to WARN during get_file() in case a file that is already
marked dead is revived as get_file() is only valid if the caller
already holds a reference to the file. This hasn't changed just the
check changes.
- The semantics for epoll and ttm's dmabuf usage have changed. Both
epoll and ttm synchronize with __fput() to prevent the underlying file
from beeing freed.
(1) epoll
Explaining epoll is straightforward using a simple diagram.
Essentially, the mutex of the epoll instance needs to be taken in both
__fput() and around epi_fget() preventing the file from being freed
while it is polled or preventing the file from being resurrected.
CPU1 CPU2
fput(file)
-> __fput(file)
-> eventpoll_release(file)
-> eventpoll_release_file(file)
mutex_lock(&ep->mtx)
epi_item_poll()
-> epi_fget()
-> file_ref_get(file)
mutex_unlock(&ep->mtx)
mutex_lock(&ep->mtx);
__ep_remove()
mutex_unlock(&ep->mtx);
-> kmem_cache_free(file)
(2) ttm dmabuf
This explanation is a bit more involved. A regular dmabuf file stashed
the dmabuf in file->private_data and the file in dmabuf->file:
file->private_data = dmabuf;
dmabuf->file = file;
The generic release method of a dmabuf file handles file specific
things:
f_op->release::dma_buf_file_release()
while the generic dentry release method of a dmabuf handles dmabuf
freeing including driver specific things:
dentry->d_release::dma_buf_release()
During ttm dmabuf initialization in ttm_object_device_init() the ttm
driver copies the provided struct dma_buf_ops into a private location:
struct ttm_object_device {
spinlock_t object_lock;
struct dma_buf_ops ops;
void (*dmabuf_release)(struct dma_buf *dma_buf);
struct idr idr;
};
ttm_object_device_init(const struct dma_buf_ops *ops)
{
// copy original dma_buf_ops in private location
tdev->ops = *ops;
// stash the release method of the original struct dma_buf_ops
tdev->dmabuf_release = tdev->ops.release;
// override the release method in the copy of the struct dma_buf_ops
// with ttm's own dmabuf release method
tdev->ops.release = ttm_prime_dmabuf_release;
}
When a new dmabuf is created the struct dma_buf_ops with the overriden
release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &tdev->ops;
exp_info.size = prime->size;
exp_info.flags = flags;
exp_info.priv = prime;
The call to dma_buf_export() then sets
mutex_lock_interruptible(&prime->mutex);
dma_buf = dma_buf_export(&exp_info)
{
dmabuf->ops = exp_info->ops;
}
mutex_unlock(&prime->mutex);
which creates a new dmabuf file and then install a file descriptor to
it in the callers file descriptor table:
ret = dma_buf_fd(dma_buf, flags);
When that dmabuf file is closed we now get:
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
Where we can see that prime->dma_buf is set to NULL. So when we have
the following diagram:
CPU1 CPU2
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
ttm_prime_handle_to_fd()
mutex_lock_interruptible(&prime->mutex)
dma_buf = prime->dma_buf
dma_buf && get_dma_buf_unless_doomed(dma_buf)
-> file_ref_get(dma_buf->file)
mutex_unlock(&prime->mutex);
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
-> kmem_cache_free(file)
The logic of the mechanism is the same as for epoll: sync with
__fput() preventing the file from being freed. Here the
synchronization happens through the ttm instance's prime->mutex.
Basically, the lifetime of the dma_buf and the file are tighly
coupled.
Both (1) and (2) used to call atomic_inc_not_zero() to check whether
the file has already been marked dead and then refuse to revive it.
This is only safe because both (1) and (2) sync with __fput() and thus
prevent kmem_cache_free() on the file being called and thus prevent
the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.
Both (1) and (2) have been ported from atomic_inc_not_zero() to
file_ref_get(). That means a file that is already in the process of
being marked as FILE_REF_DEAD:
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
can be revived again:
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
This is fine and inherent to the file_ref_get()/file_ref_put()
semantics. For both (1) and (2) this is safe because __fput() is
prevented from making progress if file_ref_get() fails due to the
aforementioned synchronization mechanisms.
Two cases need to be considered that affect both (1) epoll and (2) ttm
dmabuf:
(i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but
before that fput() can mark the file as FILE_REF_DEAD someone
manages to sneak in a file_ref_get() and brings the refcount back
from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
fput() doesn't call __fput(). For epoll the poll will finish and
for ttm dmabuf the file can be used again. For ttm dambuf this is
actually an advantage because it avoids immediately allocating
a new dmabuf object.
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
(ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
also suceeds in actually marking it FILE_REF_DEAD and then calls
into __fput() to free the file.
When either (1) or (2) call file_ref_get() they fail as
atomic_long_add_negative() will return true.
At the same time, both (1) and (2) all file_ref_get() under
mutexes that __fput() must also acquire preventing
kmem_cache_free() from freeing the file.
So while this might be treated as a change in semantics for (1) and
(2) it really isn't. It if should end up causing issues this can be
fixed by adding a helper that does something like:
long cnt = atomic_long_read(&ref->refcnt);
do {
if (cnt < 0)
return false;
} while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
return true;
which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.
- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
anymore once files have been ported to file_ref_t.
The kmem_cache_zalloc() call will memset() the whole struct file to
zero when it is reallocated. This will also set file->f_ref to zero
which mens that a concurrent file_ref_get() can return true:
CPU1 CPU2
__get_file_rcu()
rcu_dereference_raw()
close()
[frees file]
alloc_empty_file()
kmem_cache_zalloc()
[reallocates same file]
memset(..., 0, ...)
file_ref_get()
[increments 0->1, returns true]
init_file()
file_ref_init(..., 1)
[sets to 0]
rcu_dereference_raw()
fput()
file_ref_put()
[decrements 0->FILE_REF_NOREF, frees file]
[UAF]
causing a concurrent __get_file_rcu() call to acquire a reference to
the file that is about to be reallocated and immediately freeing it
on realizing that it has been recycled. This causes a UAF for the
task that reallocated/recycled the file.
This is prevented by switching from kmem_cache_zalloc() to
kmem_cache_alloc() and initializing the fields manually. With
file->f_ref initialized last.
Note that a memset() also isn't guaranteed to atomically update an
unsigned long so it's theoretically possible to see torn and
therefore bogus counter values.
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-10-07 14:23:59 +00:00
|
|
|
file_ref_init(&f->f_ref, 1);
|
2023-06-15 11:22:27 +00:00
|
|
|
return 0;
|
2018-07-18 13:44:40 +00:00
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/* Find an unused file structure and return a pointer to it.
|
2013-02-15 01:41:04 +00:00
|
|
|
* Returns an error pointer if some error happend e.g. we over file
|
|
|
|
* structures limit, run out of memory or operation is not permitted.
|
2008-02-15 22:37:26 +00:00
|
|
|
*
|
|
|
|
* Be very careful using this. You are responsible for
|
|
|
|
* getting write access to any mount that you might assign
|
|
|
|
* to this filp, if it is opened for write. If this is not
|
|
|
|
* done, you will imbalance int the mount's writer count
|
|
|
|
* and a warning at __fput() time.
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
2018-07-11 19:00:04 +00:00
|
|
|
struct file *alloc_empty_file(int flags, const struct cred *cred)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2010-10-26 21:22:44 +00:00
|
|
|
static long old_max;
|
2013-02-15 01:41:04 +00:00
|
|
|
struct file *f;
|
2023-06-15 11:22:27 +00:00
|
|
|
int error;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Privileged users can go above max_files
|
|
|
|
*/
|
2006-03-08 05:55:35 +00:00
|
|
|
if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
|
|
|
|
/*
|
|
|
|
* percpu_counters are inaccurate. Do an expensive check before
|
|
|
|
* we go and fail.
|
|
|
|
*/
|
2007-10-17 06:25:44 +00:00
|
|
|
if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
|
2006-03-08 05:55:35 +00:00
|
|
|
goto over;
|
|
|
|
}
|
2005-06-23 07:09:50 +00:00
|
|
|
|
fs: port files to file_ref
Port files to rely on file_ref reference to improve scaling and gain
overflow protection.
- We continue to WARN during get_file() in case a file that is already
marked dead is revived as get_file() is only valid if the caller
already holds a reference to the file. This hasn't changed just the
check changes.
- The semantics for epoll and ttm's dmabuf usage have changed. Both
epoll and ttm synchronize with __fput() to prevent the underlying file
from beeing freed.
(1) epoll
Explaining epoll is straightforward using a simple diagram.
Essentially, the mutex of the epoll instance needs to be taken in both
__fput() and around epi_fget() preventing the file from being freed
while it is polled or preventing the file from being resurrected.
CPU1 CPU2
fput(file)
-> __fput(file)
-> eventpoll_release(file)
-> eventpoll_release_file(file)
mutex_lock(&ep->mtx)
epi_item_poll()
-> epi_fget()
-> file_ref_get(file)
mutex_unlock(&ep->mtx)
mutex_lock(&ep->mtx);
__ep_remove()
mutex_unlock(&ep->mtx);
-> kmem_cache_free(file)
(2) ttm dmabuf
This explanation is a bit more involved. A regular dmabuf file stashed
the dmabuf in file->private_data and the file in dmabuf->file:
file->private_data = dmabuf;
dmabuf->file = file;
The generic release method of a dmabuf file handles file specific
things:
f_op->release::dma_buf_file_release()
while the generic dentry release method of a dmabuf handles dmabuf
freeing including driver specific things:
dentry->d_release::dma_buf_release()
During ttm dmabuf initialization in ttm_object_device_init() the ttm
driver copies the provided struct dma_buf_ops into a private location:
struct ttm_object_device {
spinlock_t object_lock;
struct dma_buf_ops ops;
void (*dmabuf_release)(struct dma_buf *dma_buf);
struct idr idr;
};
ttm_object_device_init(const struct dma_buf_ops *ops)
{
// copy original dma_buf_ops in private location
tdev->ops = *ops;
// stash the release method of the original struct dma_buf_ops
tdev->dmabuf_release = tdev->ops.release;
// override the release method in the copy of the struct dma_buf_ops
// with ttm's own dmabuf release method
tdev->ops.release = ttm_prime_dmabuf_release;
}
When a new dmabuf is created the struct dma_buf_ops with the overriden
release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &tdev->ops;
exp_info.size = prime->size;
exp_info.flags = flags;
exp_info.priv = prime;
The call to dma_buf_export() then sets
mutex_lock_interruptible(&prime->mutex);
dma_buf = dma_buf_export(&exp_info)
{
dmabuf->ops = exp_info->ops;
}
mutex_unlock(&prime->mutex);
which creates a new dmabuf file and then install a file descriptor to
it in the callers file descriptor table:
ret = dma_buf_fd(dma_buf, flags);
When that dmabuf file is closed we now get:
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
Where we can see that prime->dma_buf is set to NULL. So when we have
the following diagram:
CPU1 CPU2
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
ttm_prime_handle_to_fd()
mutex_lock_interruptible(&prime->mutex)
dma_buf = prime->dma_buf
dma_buf && get_dma_buf_unless_doomed(dma_buf)
-> file_ref_get(dma_buf->file)
mutex_unlock(&prime->mutex);
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
-> kmem_cache_free(file)
The logic of the mechanism is the same as for epoll: sync with
__fput() preventing the file from being freed. Here the
synchronization happens through the ttm instance's prime->mutex.
Basically, the lifetime of the dma_buf and the file are tighly
coupled.
Both (1) and (2) used to call atomic_inc_not_zero() to check whether
the file has already been marked dead and then refuse to revive it.
This is only safe because both (1) and (2) sync with __fput() and thus
prevent kmem_cache_free() on the file being called and thus prevent
the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.
Both (1) and (2) have been ported from atomic_inc_not_zero() to
file_ref_get(). That means a file that is already in the process of
being marked as FILE_REF_DEAD:
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
can be revived again:
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
This is fine and inherent to the file_ref_get()/file_ref_put()
semantics. For both (1) and (2) this is safe because __fput() is
prevented from making progress if file_ref_get() fails due to the
aforementioned synchronization mechanisms.
Two cases need to be considered that affect both (1) epoll and (2) ttm
dmabuf:
(i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but
before that fput() can mark the file as FILE_REF_DEAD someone
manages to sneak in a file_ref_get() and brings the refcount back
from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
fput() doesn't call __fput(). For epoll the poll will finish and
for ttm dmabuf the file can be used again. For ttm dambuf this is
actually an advantage because it avoids immediately allocating
a new dmabuf object.
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
(ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
also suceeds in actually marking it FILE_REF_DEAD and then calls
into __fput() to free the file.
When either (1) or (2) call file_ref_get() they fail as
atomic_long_add_negative() will return true.
At the same time, both (1) and (2) all file_ref_get() under
mutexes that __fput() must also acquire preventing
kmem_cache_free() from freeing the file.
So while this might be treated as a change in semantics for (1) and
(2) it really isn't. It if should end up causing issues this can be
fixed by adding a helper that does something like:
long cnt = atomic_long_read(&ref->refcnt);
do {
if (cnt < 0)
return false;
} while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
return true;
which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.
- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
anymore once files have been ported to file_ref_t.
The kmem_cache_zalloc() call will memset() the whole struct file to
zero when it is reallocated. This will also set file->f_ref to zero
which mens that a concurrent file_ref_get() can return true:
CPU1 CPU2
__get_file_rcu()
rcu_dereference_raw()
close()
[frees file]
alloc_empty_file()
kmem_cache_zalloc()
[reallocates same file]
memset(..., 0, ...)
file_ref_get()
[increments 0->1, returns true]
init_file()
file_ref_init(..., 1)
[sets to 0]
rcu_dereference_raw()
fput()
file_ref_put()
[decrements 0->FILE_REF_NOREF, frees file]
[UAF]
causing a concurrent __get_file_rcu() call to acquire a reference to
the file that is about to be reallocated and immediately freeing it
on realizing that it has been recycled. This causes a UAF for the
task that reallocated/recycled the file.
This is prevented by switching from kmem_cache_zalloc() to
kmem_cache_alloc() and initializing the fields manually. With
file->f_ref initialized last.
Note that a memset() also isn't guaranteed to atomically update an
unsigned long so it's theoretically possible to see torn and
therefore bogus counter values.
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-10-07 14:23:59 +00:00
|
|
|
f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
|
2023-06-15 11:22:27 +00:00
|
|
|
if (unlikely(!f))
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
error = init_file(f, flags, cred);
|
2023-07-01 17:11:34 +00:00
|
|
|
if (unlikely(error)) {
|
|
|
|
kmem_cache_free(filp_cachep, f);
|
2023-06-15 11:22:27 +00:00
|
|
|
return ERR_PTR(error);
|
2023-07-01 17:11:34 +00:00
|
|
|
}
|
2023-06-15 11:22:27 +00:00
|
|
|
|
|
|
|
percpu_counter_inc(&nr_files);
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2005-06-23 07:09:50 +00:00
|
|
|
return f;
|
|
|
|
|
|
|
|
over:
|
2005-04-16 22:20:36 +00:00
|
|
|
/* Ran out of filps - report that */
|
2006-03-08 05:55:35 +00:00
|
|
|
if (get_nr_files() > old_max) {
|
2010-10-26 21:22:44 +00:00
|
|
|
pr_info("VFS: file-max limit %lu reached\n", get_max_files());
|
2006-03-08 05:55:35 +00:00
|
|
|
old_max = get_nr_files();
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
2013-02-15 01:41:04 +00:00
|
|
|
return ERR_PTR(-ENFILE);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2018-07-18 13:44:40 +00:00
|
|
|
/*
|
|
|
|
* Variant of alloc_empty_file() that doesn't check and modify nr_files.
|
|
|
|
*
|
2023-06-15 11:22:27 +00:00
|
|
|
* This is only for kernel internal use, and the allocate file must not be
|
|
|
|
* installed into file tables or such.
|
2018-07-18 13:44:40 +00:00
|
|
|
*/
|
|
|
|
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
|
|
|
|
{
|
2023-06-15 11:22:27 +00:00
|
|
|
struct file *f;
|
|
|
|
int error;
|
|
|
|
|
fs: port files to file_ref
Port files to rely on file_ref reference to improve scaling and gain
overflow protection.
- We continue to WARN during get_file() in case a file that is already
marked dead is revived as get_file() is only valid if the caller
already holds a reference to the file. This hasn't changed just the
check changes.
- The semantics for epoll and ttm's dmabuf usage have changed. Both
epoll and ttm synchronize with __fput() to prevent the underlying file
from beeing freed.
(1) epoll
Explaining epoll is straightforward using a simple diagram.
Essentially, the mutex of the epoll instance needs to be taken in both
__fput() and around epi_fget() preventing the file from being freed
while it is polled or preventing the file from being resurrected.
CPU1 CPU2
fput(file)
-> __fput(file)
-> eventpoll_release(file)
-> eventpoll_release_file(file)
mutex_lock(&ep->mtx)
epi_item_poll()
-> epi_fget()
-> file_ref_get(file)
mutex_unlock(&ep->mtx)
mutex_lock(&ep->mtx);
__ep_remove()
mutex_unlock(&ep->mtx);
-> kmem_cache_free(file)
(2) ttm dmabuf
This explanation is a bit more involved. A regular dmabuf file stashed
the dmabuf in file->private_data and the file in dmabuf->file:
file->private_data = dmabuf;
dmabuf->file = file;
The generic release method of a dmabuf file handles file specific
things:
f_op->release::dma_buf_file_release()
while the generic dentry release method of a dmabuf handles dmabuf
freeing including driver specific things:
dentry->d_release::dma_buf_release()
During ttm dmabuf initialization in ttm_object_device_init() the ttm
driver copies the provided struct dma_buf_ops into a private location:
struct ttm_object_device {
spinlock_t object_lock;
struct dma_buf_ops ops;
void (*dmabuf_release)(struct dma_buf *dma_buf);
struct idr idr;
};
ttm_object_device_init(const struct dma_buf_ops *ops)
{
// copy original dma_buf_ops in private location
tdev->ops = *ops;
// stash the release method of the original struct dma_buf_ops
tdev->dmabuf_release = tdev->ops.release;
// override the release method in the copy of the struct dma_buf_ops
// with ttm's own dmabuf release method
tdev->ops.release = ttm_prime_dmabuf_release;
}
When a new dmabuf is created the struct dma_buf_ops with the overriden
release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &tdev->ops;
exp_info.size = prime->size;
exp_info.flags = flags;
exp_info.priv = prime;
The call to dma_buf_export() then sets
mutex_lock_interruptible(&prime->mutex);
dma_buf = dma_buf_export(&exp_info)
{
dmabuf->ops = exp_info->ops;
}
mutex_unlock(&prime->mutex);
which creates a new dmabuf file and then install a file descriptor to
it in the callers file descriptor table:
ret = dma_buf_fd(dma_buf, flags);
When that dmabuf file is closed we now get:
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
Where we can see that prime->dma_buf is set to NULL. So when we have
the following diagram:
CPU1 CPU2
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
ttm_prime_handle_to_fd()
mutex_lock_interruptible(&prime->mutex)
dma_buf = prime->dma_buf
dma_buf && get_dma_buf_unless_doomed(dma_buf)
-> file_ref_get(dma_buf->file)
mutex_unlock(&prime->mutex);
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
-> kmem_cache_free(file)
The logic of the mechanism is the same as for epoll: sync with
__fput() preventing the file from being freed. Here the
synchronization happens through the ttm instance's prime->mutex.
Basically, the lifetime of the dma_buf and the file are tighly
coupled.
Both (1) and (2) used to call atomic_inc_not_zero() to check whether
the file has already been marked dead and then refuse to revive it.
This is only safe because both (1) and (2) sync with __fput() and thus
prevent kmem_cache_free() on the file being called and thus prevent
the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.
Both (1) and (2) have been ported from atomic_inc_not_zero() to
file_ref_get(). That means a file that is already in the process of
being marked as FILE_REF_DEAD:
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
can be revived again:
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
This is fine and inherent to the file_ref_get()/file_ref_put()
semantics. For both (1) and (2) this is safe because __fput() is
prevented from making progress if file_ref_get() fails due to the
aforementioned synchronization mechanisms.
Two cases need to be considered that affect both (1) epoll and (2) ttm
dmabuf:
(i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but
before that fput() can mark the file as FILE_REF_DEAD someone
manages to sneak in a file_ref_get() and brings the refcount back
from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
fput() doesn't call __fput(). For epoll the poll will finish and
for ttm dmabuf the file can be used again. For ttm dambuf this is
actually an advantage because it avoids immediately allocating
a new dmabuf object.
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
(ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
also suceeds in actually marking it FILE_REF_DEAD and then calls
into __fput() to free the file.
When either (1) or (2) call file_ref_get() they fail as
atomic_long_add_negative() will return true.
At the same time, both (1) and (2) all file_ref_get() under
mutexes that __fput() must also acquire preventing
kmem_cache_free() from freeing the file.
So while this might be treated as a change in semantics for (1) and
(2) it really isn't. It if should end up causing issues this can be
fixed by adding a helper that does something like:
long cnt = atomic_long_read(&ref->refcnt);
do {
if (cnt < 0)
return false;
} while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
return true;
which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.
- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
anymore once files have been ported to file_ref_t.
The kmem_cache_zalloc() call will memset() the whole struct file to
zero when it is reallocated. This will also set file->f_ref to zero
which mens that a concurrent file_ref_get() can return true:
CPU1 CPU2
__get_file_rcu()
rcu_dereference_raw()
close()
[frees file]
alloc_empty_file()
kmem_cache_zalloc()
[reallocates same file]
memset(..., 0, ...)
file_ref_get()
[increments 0->1, returns true]
init_file()
file_ref_init(..., 1)
[sets to 0]
rcu_dereference_raw()
fput()
file_ref_put()
[decrements 0->FILE_REF_NOREF, frees file]
[UAF]
causing a concurrent __get_file_rcu() call to acquire a reference to
the file that is about to be reallocated and immediately freeing it
on realizing that it has been recycled. This causes a UAF for the
task that reallocated/recycled the file.
This is prevented by switching from kmem_cache_zalloc() to
kmem_cache_alloc() and initializing the fields manually. With
file->f_ref initialized last.
Note that a memset() also isn't guaranteed to atomically update an
unsigned long so it's theoretically possible to see torn and
therefore bogus counter values.
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-10-07 14:23:59 +00:00
|
|
|
f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
|
2023-06-15 11:22:27 +00:00
|
|
|
if (unlikely(!f))
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
error = init_file(f, flags, cred);
|
2023-07-01 17:11:34 +00:00
|
|
|
if (unlikely(error)) {
|
|
|
|
kmem_cache_free(filp_cachep, f);
|
2023-06-15 11:22:27 +00:00
|
|
|
return ERR_PTR(error);
|
2023-07-01 17:11:34 +00:00
|
|
|
}
|
2018-07-18 13:44:40 +00:00
|
|
|
|
2023-06-15 11:22:27 +00:00
|
|
|
f->f_mode |= FMODE_NOACCOUNT;
|
2018-07-18 13:44:40 +00:00
|
|
|
|
|
|
|
return f;
|
|
|
|
}
|
|
|
|
|
2023-06-15 11:22:28 +00:00
|
|
|
/*
|
|
|
|
* Variant of alloc_empty_file() that allocates a backing_file container
|
|
|
|
* and doesn't check and modify nr_files.
|
|
|
|
*
|
|
|
|
* This is only for kernel internal use, and the allocate file must not be
|
|
|
|
* installed into file tables or such.
|
|
|
|
*/
|
|
|
|
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
|
|
|
|
{
|
|
|
|
struct backing_file *ff;
|
|
|
|
int error;
|
|
|
|
|
fs: port files to file_ref
Port files to rely on file_ref reference to improve scaling and gain
overflow protection.
- We continue to WARN during get_file() in case a file that is already
marked dead is revived as get_file() is only valid if the caller
already holds a reference to the file. This hasn't changed just the
check changes.
- The semantics for epoll and ttm's dmabuf usage have changed. Both
epoll and ttm synchronize with __fput() to prevent the underlying file
from beeing freed.
(1) epoll
Explaining epoll is straightforward using a simple diagram.
Essentially, the mutex of the epoll instance needs to be taken in both
__fput() and around epi_fget() preventing the file from being freed
while it is polled or preventing the file from being resurrected.
CPU1 CPU2
fput(file)
-> __fput(file)
-> eventpoll_release(file)
-> eventpoll_release_file(file)
mutex_lock(&ep->mtx)
epi_item_poll()
-> epi_fget()
-> file_ref_get(file)
mutex_unlock(&ep->mtx)
mutex_lock(&ep->mtx);
__ep_remove()
mutex_unlock(&ep->mtx);
-> kmem_cache_free(file)
(2) ttm dmabuf
This explanation is a bit more involved. A regular dmabuf file stashed
the dmabuf in file->private_data and the file in dmabuf->file:
file->private_data = dmabuf;
dmabuf->file = file;
The generic release method of a dmabuf file handles file specific
things:
f_op->release::dma_buf_file_release()
while the generic dentry release method of a dmabuf handles dmabuf
freeing including driver specific things:
dentry->d_release::dma_buf_release()
During ttm dmabuf initialization in ttm_object_device_init() the ttm
driver copies the provided struct dma_buf_ops into a private location:
struct ttm_object_device {
spinlock_t object_lock;
struct dma_buf_ops ops;
void (*dmabuf_release)(struct dma_buf *dma_buf);
struct idr idr;
};
ttm_object_device_init(const struct dma_buf_ops *ops)
{
// copy original dma_buf_ops in private location
tdev->ops = *ops;
// stash the release method of the original struct dma_buf_ops
tdev->dmabuf_release = tdev->ops.release;
// override the release method in the copy of the struct dma_buf_ops
// with ttm's own dmabuf release method
tdev->ops.release = ttm_prime_dmabuf_release;
}
When a new dmabuf is created the struct dma_buf_ops with the overriden
release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &tdev->ops;
exp_info.size = prime->size;
exp_info.flags = flags;
exp_info.priv = prime;
The call to dma_buf_export() then sets
mutex_lock_interruptible(&prime->mutex);
dma_buf = dma_buf_export(&exp_info)
{
dmabuf->ops = exp_info->ops;
}
mutex_unlock(&prime->mutex);
which creates a new dmabuf file and then install a file descriptor to
it in the callers file descriptor table:
ret = dma_buf_fd(dma_buf, flags);
When that dmabuf file is closed we now get:
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
Where we can see that prime->dma_buf is set to NULL. So when we have
the following diagram:
CPU1 CPU2
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
ttm_prime_handle_to_fd()
mutex_lock_interruptible(&prime->mutex)
dma_buf = prime->dma_buf
dma_buf && get_dma_buf_unless_doomed(dma_buf)
-> file_ref_get(dma_buf->file)
mutex_unlock(&prime->mutex);
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
-> kmem_cache_free(file)
The logic of the mechanism is the same as for epoll: sync with
__fput() preventing the file from being freed. Here the
synchronization happens through the ttm instance's prime->mutex.
Basically, the lifetime of the dma_buf and the file are tighly
coupled.
Both (1) and (2) used to call atomic_inc_not_zero() to check whether
the file has already been marked dead and then refuse to revive it.
This is only safe because both (1) and (2) sync with __fput() and thus
prevent kmem_cache_free() on the file being called and thus prevent
the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.
Both (1) and (2) have been ported from atomic_inc_not_zero() to
file_ref_get(). That means a file that is already in the process of
being marked as FILE_REF_DEAD:
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
can be revived again:
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
This is fine and inherent to the file_ref_get()/file_ref_put()
semantics. For both (1) and (2) this is safe because __fput() is
prevented from making progress if file_ref_get() fails due to the
aforementioned synchronization mechanisms.
Two cases need to be considered that affect both (1) epoll and (2) ttm
dmabuf:
(i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but
before that fput() can mark the file as FILE_REF_DEAD someone
manages to sneak in a file_ref_get() and brings the refcount back
from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
fput() doesn't call __fput(). For epoll the poll will finish and
for ttm dmabuf the file can be used again. For ttm dambuf this is
actually an advantage because it avoids immediately allocating
a new dmabuf object.
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
(ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
also suceeds in actually marking it FILE_REF_DEAD and then calls
into __fput() to free the file.
When either (1) or (2) call file_ref_get() they fail as
atomic_long_add_negative() will return true.
At the same time, both (1) and (2) all file_ref_get() under
mutexes that __fput() must also acquire preventing
kmem_cache_free() from freeing the file.
So while this might be treated as a change in semantics for (1) and
(2) it really isn't. It if should end up causing issues this can be
fixed by adding a helper that does something like:
long cnt = atomic_long_read(&ref->refcnt);
do {
if (cnt < 0)
return false;
} while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
return true;
which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.
- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
anymore once files have been ported to file_ref_t.
The kmem_cache_zalloc() call will memset() the whole struct file to
zero when it is reallocated. This will also set file->f_ref to zero
which mens that a concurrent file_ref_get() can return true:
CPU1 CPU2
__get_file_rcu()
rcu_dereference_raw()
close()
[frees file]
alloc_empty_file()
kmem_cache_zalloc()
[reallocates same file]
memset(..., 0, ...)
file_ref_get()
[increments 0->1, returns true]
init_file()
file_ref_init(..., 1)
[sets to 0]
rcu_dereference_raw()
fput()
file_ref_put()
[decrements 0->FILE_REF_NOREF, frees file]
[UAF]
causing a concurrent __get_file_rcu() call to acquire a reference to
the file that is about to be reallocated and immediately freeing it
on realizing that it has been recycled. This causes a UAF for the
task that reallocated/recycled the file.
This is prevented by switching from kmem_cache_zalloc() to
kmem_cache_alloc() and initializing the fields manually. With
file->f_ref initialized last.
Note that a memset() also isn't guaranteed to atomically update an
unsigned long so it's theoretically possible to see torn and
therefore bogus counter values.
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-10-07 14:23:59 +00:00
|
|
|
ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
|
2023-06-15 11:22:28 +00:00
|
|
|
if (unlikely(!ff))
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
error = init_file(&ff->file, flags, cred);
|
2023-07-01 17:11:34 +00:00
|
|
|
if (unlikely(error)) {
|
2024-10-07 14:23:57 +00:00
|
|
|
kmem_cache_free(bfilp_cachep, ff);
|
2023-06-15 11:22:28 +00:00
|
|
|
return ERR_PTR(error);
|
2023-07-01 17:11:34 +00:00
|
|
|
}
|
2023-06-15 11:22:28 +00:00
|
|
|
|
|
|
|
ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
|
|
|
|
return &ff->file;
|
|
|
|
}
|
|
|
|
|
2007-10-17 06:31:13 +00:00
|
|
|
/**
|
2024-02-08 18:08:37 +00:00
|
|
|
* file_init_path - initialize a 'struct file' based on path
|
2014-10-12 19:29:29 +00:00
|
|
|
*
|
2024-02-08 18:08:37 +00:00
|
|
|
* @file: the file to set up
|
2014-10-12 19:29:29 +00:00
|
|
|
* @path: the (dentry, vfsmount) pair for the new file
|
2007-10-17 06:31:13 +00:00
|
|
|
* @fop: the 'struct file_operations' for the new file
|
|
|
|
*/
|
2024-02-08 18:08:37 +00:00
|
|
|
static void file_init_path(struct file *file, const struct path *path,
|
|
|
|
const struct file_operations *fop)
|
2007-10-17 06:31:13 +00:00
|
|
|
{
|
2009-08-08 20:52:35 +00:00
|
|
|
file->f_path = *path;
|
2013-03-02 00:48:30 +00:00
|
|
|
file->f_inode = path->dentry->d_inode;
|
2009-08-08 20:52:35 +00:00
|
|
|
file->f_mapping = path->dentry->d_inode->i_mapping;
|
fs: new infrastructure for writeback error handling and reporting
Most filesystems currently use mapping_set_error and
filemap_check_errors for setting and reporting/clearing writeback errors
at the mapping level. filemap_check_errors is indirectly called from
most of the filemap_fdatawait_* functions and from
filemap_write_and_wait*. These functions are called from all sorts of
contexts to wait on writeback to finish -- e.g. mostly in fsync, but
also in truncate calls, getattr, etc.
The non-fsync callers are problematic. We should be reporting writeback
errors during fsync, but many places spread over the tree clear out
errors before they can be properly reported, or report errors at
nonsensical times.
If I get -EIO on a stat() call, there is no reason for me to assume that
it is because some previous writeback failed. The fact that it also
clears out the error such that a subsequent fsync returns 0 is a bug,
and a nasty one since that's potentially silent data corruption.
This patch adds a small bit of new infrastructure for setting and
reporting errors during address_space writeback. While the above was my
original impetus for adding this, I think it's also the case that
current fsync semantics are just problematic for userland. Most
applications that call fsync do so to ensure that the data they wrote
has hit the backing store.
In the case where there are multiple writers to the file at the same
time, this is really hard to determine. The first one to call fsync will
see any stored error, and the rest get back 0. The processes with open
fds may not be associated with one another in any way. They could even
be in different containers, so ensuring coordination between all fsync
callers is not really an option.
One way to remedy this would be to track what file descriptor was used
to dirty the file, but that's rather cumbersome and would likely be
slow. However, there is a simpler way to improve the semantics here
without incurring too much overhead.
This set adds an errseq_t to struct address_space, and a corresponding
one is added to struct file. Writeback errors are recorded in the
mapping's errseq_t, and the one in struct file is used as the "since"
value.
This changes the semantics of the Linux fsync implementation such that
applications can now use it to determine whether there were any
writeback errors since fsync(fd) was last called (or since the file was
opened in the case of fsync having never been called).
Note that those writeback errors may have occurred when writing data
that was dirtied via an entirely different fd, but that's the case now
with the current mapping_set_error/filemap_check_error infrastructure.
This will at least prevent you from getting a false report of success.
The new behavior is still consistent with the POSIX spec, and is more
reliable for application developers. This patch just adds some basic
infrastructure for doing this, and ensures that the f_wb_err "cursor"
is properly set when a file is opened. Later patches will change the
existing code to use this new infrastructure for reporting errors at
fsync time.
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
2017-07-06 11:02:25 +00:00
|
|
|
file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
|
vfs: track per-sb writeback errors and report them to syncfs
Patch series "vfs: have syncfs() return error when there are writeback
errors", v6.
Currently, syncfs does not return errors when one of the inodes fails to
be written back. It will return errors based on the legacy AS_EIO and
AS_ENOSPC flags when syncing out the block device fails, but that's not
particularly helpful for filesystems that aren't backed by a blockdev.
It's also possible for a stray sync to lose those errors.
The basic idea in this set is to track writeback errors at the
superblock level, so that we can quickly and easily check whether
something bad happened without having to fsync each file individually.
syncfs is then changed to reliably report writeback errors after they
occur, much in the same fashion as fsync does now.
This patch (of 2):
Usually we suggest that applications call fsync when they want to ensure
that all data written to the file has made it to the backing store, but
that can be inefficient when there are a lot of open files.
Calling syncfs on the filesystem can be more efficient in some
situations, but the error reporting doesn't currently work the way most
people expect. If a single inode on a filesystem reports a writeback
error, syncfs won't necessarily return an error. syncfs only returns an
error if __sync_blockdev fails, and on some filesystems that's a no-op.
It would be better if syncfs reported an error if there were any
writeback failures. Then applications could call syncfs to see if there
are any errors on any open files, and could then call fsync on all of
the other descriptors to figure out which one failed.
This patch adds a new errseq_t to struct super_block, and has
mapping_set_error also record writeback errors there.
To report those errors, we also need to keep an errseq_t in struct file
to act as a cursor. This patch adds a dedicated field for that purpose,
which slots nicely into 4 bytes of padding at the end of struct file on
x86_64.
An earlier version of this patch used an O_PATH file descriptor to cue
the kernel that the open file should track the superblock error and not
the inode's writeback error.
I think that API is just too weird though. This is simpler and should
make syncfs error reporting "just work" even if someone is multiplexing
fsync and syncfs on the same fds.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andres Freund <andres@anarazel.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: David Howells <dhowells@redhat.com>
Link: http://lkml.kernel.org/r/20200428135155.19223-1-jlayton@kernel.org
Link: http://lkml.kernel.org/r/20200428135155.19223-2-jlayton@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 04:45:36 +00:00
|
|
|
file->f_sb_err = file_sample_sb_err(file);
|
2022-06-29 13:07:00 +00:00
|
|
|
if (fop->llseek)
|
2022-06-29 13:06:57 +00:00
|
|
|
file->f_mode |= FMODE_LSEEK;
|
2018-07-11 18:19:04 +00:00
|
|
|
if ((file->f_mode & FMODE_READ) &&
|
2015-04-04 05:14:53 +00:00
|
|
|
likely(fop->read || fop->read_iter))
|
2018-07-11 18:19:04 +00:00
|
|
|
file->f_mode |= FMODE_CAN_READ;
|
|
|
|
if ((file->f_mode & FMODE_WRITE) &&
|
2015-04-04 05:14:53 +00:00
|
|
|
likely(fop->write || fop->write_iter))
|
2018-07-11 18:19:04 +00:00
|
|
|
file->f_mode |= FMODE_CAN_WRITE;
|
2022-05-22 15:38:11 +00:00
|
|
|
file->f_iocb_flags = iocb_flags(file);
|
2018-07-09 06:35:08 +00:00
|
|
|
file->f_mode |= FMODE_OPENED;
|
2007-10-17 06:31:13 +00:00
|
|
|
file->f_op = fop;
|
2018-07-11 18:19:04 +00:00
|
|
|
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
|
2010-11-02 14:13:07 +00:00
|
|
|
i_readcount_inc(path->dentry->d_inode);
|
2024-02-08 18:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* alloc_file - allocate and initialize a 'struct file'
|
|
|
|
*
|
|
|
|
* @path: the (dentry, vfsmount) pair for the new file
|
|
|
|
* @flags: O_... flags with which the new file will be opened
|
|
|
|
* @fop: the 'struct file_operations' for the new file
|
|
|
|
*/
|
|
|
|
static struct file *alloc_file(const struct path *path, int flags,
|
|
|
|
const struct file_operations *fop)
|
|
|
|
{
|
|
|
|
struct file *file;
|
|
|
|
|
|
|
|
file = alloc_empty_file(flags, current_cred());
|
|
|
|
if (!IS_ERR(file))
|
|
|
|
file_init_path(file, path, fop);
|
2009-08-08 19:56:29 +00:00
|
|
|
return file;
|
2007-10-17 06:31:13 +00:00
|
|
|
}
|
|
|
|
|
2024-02-08 18:08:37 +00:00
|
|
|
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
|
|
|
|
struct vfsmount *mnt, struct path *path)
|
2018-06-09 13:40:05 +00:00
|
|
|
{
|
|
|
|
struct qstr this = QSTR_INIT(name, strlen(name));
|
2024-02-08 18:08:37 +00:00
|
|
|
|
|
|
|
path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
|
|
|
|
if (!path->dentry)
|
|
|
|
return -ENOMEM;
|
|
|
|
path->mnt = mntget(mnt);
|
|
|
|
d_instantiate(path->dentry, inode);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
|
|
|
|
const char *name, int flags,
|
|
|
|
const struct file_operations *fops)
|
|
|
|
{
|
|
|
|
int ret;
|
2018-06-09 13:40:05 +00:00
|
|
|
struct path path;
|
|
|
|
struct file *file;
|
|
|
|
|
2024-02-08 18:08:37 +00:00
|
|
|
ret = alloc_path_pseudo(name, inode, mnt, &path);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
2020-06-29 14:41:45 +00:00
|
|
|
file = alloc_file(&path, flags, fops);
|
2018-06-09 13:40:05 +00:00
|
|
|
if (IS_ERR(file)) {
|
|
|
|
ihold(inode);
|
|
|
|
path_put(&path);
|
|
|
|
}
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(alloc_file_pseudo);
|
|
|
|
|
2024-02-08 18:10:45 +00:00
|
|
|
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
|
|
|
|
struct vfsmount *mnt, const char *name,
|
|
|
|
int flags,
|
|
|
|
const struct file_operations *fops)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct path path;
|
|
|
|
struct file *file;
|
|
|
|
|
|
|
|
ret = alloc_path_pseudo(name, inode, mnt, &path);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
|
|
|
file = alloc_empty_file_noaccount(flags, current_cred());
|
|
|
|
if (IS_ERR(file)) {
|
|
|
|
ihold(inode);
|
|
|
|
path_put(&path);
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
file_init_path(file, &path, fops);
|
|
|
|
return file;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);
|
|
|
|
|
2018-06-17 18:15:10 +00:00
|
|
|
struct file *alloc_file_clone(struct file *base, int flags,
|
|
|
|
const struct file_operations *fops)
|
|
|
|
{
|
2024-07-27 07:21:34 +00:00
|
|
|
struct file *f;
|
|
|
|
|
|
|
|
f = alloc_file(&base->f_path, flags, fops);
|
2018-06-17 18:15:10 +00:00
|
|
|
if (!IS_ERR(f)) {
|
|
|
|
path_get(&f->f_path);
|
|
|
|
f->f_mapping = base->f_mapping;
|
|
|
|
}
|
|
|
|
return f;
|
|
|
|
}
|
|
|
|
|
2010-05-26 19:13:55 +00:00
|
|
|
/* the real guts of fput() - releasing the last reference to file
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
2010-05-26 19:13:55 +00:00
|
|
|
static void __fput(struct file *file)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2006-12-08 10:36:35 +00:00
|
|
|
struct dentry *dentry = file->f_path.dentry;
|
|
|
|
struct vfsmount *mnt = file->f_path.mnt;
|
2013-06-13 22:37:49 +00:00
|
|
|
struct inode *inode = file->f_inode;
|
2018-11-05 17:40:30 +00:00
|
|
|
fmode_t mode = file->f_mode;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2018-07-09 15:14:39 +00:00
|
|
|
if (unlikely(!(file->f_mode & FMODE_OPENED)))
|
|
|
|
goto out;
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
might_sleep();
|
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-12 21:06:03 +00:00
|
|
|
|
|
|
|
fsnotify_close(file);
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* The function eventpoll_release() should be the first called
|
|
|
|
* in the file cleanup chain.
|
|
|
|
*/
|
|
|
|
eventpoll_release(file);
|
2014-02-03 17:13:08 +00:00
|
|
|
locks_remove_file(file);
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2024-02-15 10:31:01 +00:00
|
|
|
security_file_release(file);
|
2008-10-31 23:28:30 +00:00
|
|
|
if (unlikely(file->f_flags & FASYNC)) {
|
2013-09-22 20:27:52 +00:00
|
|
|
if (file->f_op->fasync)
|
2008-10-31 23:28:30 +00:00
|
|
|
file->f_op->fasync(-1, file, 0);
|
|
|
|
}
|
2013-09-22 20:27:52 +00:00
|
|
|
if (file->f_op->release)
|
2005-04-16 22:20:36 +00:00
|
|
|
file->f_op->release(inode, file);
|
2011-03-16 17:17:54 +00:00
|
|
|
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
|
2018-11-05 17:40:30 +00:00
|
|
|
!(mode & FMODE_PATH))) {
|
2005-04-16 22:20:36 +00:00
|
|
|
cdev_put(inode->i_cdev);
|
2011-03-16 17:17:54 +00:00
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
fops_put(file->f_op);
|
2024-08-09 16:00:01 +00:00
|
|
|
file_f_owner_release(file);
|
2022-08-16 14:53:17 +00:00
|
|
|
put_file_access(file);
|
2005-04-16 22:20:36 +00:00
|
|
|
dput(dentry);
|
2018-11-05 17:40:30 +00:00
|
|
|
if (unlikely(mode & FMODE_NEED_UNMOUNT))
|
|
|
|
dissolve_on_fput(mnt);
|
2005-04-16 22:20:36 +00:00
|
|
|
mntput(mnt);
|
2018-07-09 15:14:39 +00:00
|
|
|
out:
|
|
|
|
file_free(file);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2013-07-08 21:24:16 +00:00
|
|
|
static LLIST_HEAD(delayed_fput_list);
|
2012-06-24 05:56:45 +00:00
|
|
|
static void delayed_fput(struct work_struct *unused)
|
|
|
|
{
|
2013-07-08 21:24:16 +00:00
|
|
|
struct llist_node *node = llist_del_all(&delayed_fput_list);
|
2017-08-07 08:45:39 +00:00
|
|
|
struct file *f, *t;
|
2013-07-08 21:24:16 +00:00
|
|
|
|
2022-05-22 13:28:12 +00:00
|
|
|
llist_for_each_entry_safe(f, t, node, f_llist)
|
2017-08-07 08:45:39 +00:00
|
|
|
__fput(f);
|
2012-06-24 05:56:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void ____fput(struct callback_head *work)
|
|
|
|
{
|
2023-11-30 12:49:09 +00:00
|
|
|
__fput(container_of(work, struct file, f_task_work));
|
2012-06-24 05:56:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If kernel thread really needs to have the final fput() it has done
|
|
|
|
* to complete, call this. The only user right now is the boot - we
|
|
|
|
* *do* need to make sure our writes to binaries on initramfs has
|
|
|
|
* not left us with opened struct file waiting for __fput() - execve()
|
|
|
|
* won't work without that. Please, don't add more callers without
|
|
|
|
* very good reasons; in particular, never call that with locks
|
|
|
|
* held and never call that from a thread that might need to do
|
|
|
|
* some work on any kind of umount.
|
|
|
|
*/
|
|
|
|
void flush_delayed_fput(void)
|
|
|
|
{
|
|
|
|
delayed_fput(NULL);
|
|
|
|
}
|
2019-08-18 18:18:47 +00:00
|
|
|
EXPORT_SYMBOL_GPL(flush_delayed_fput);
|
2012-06-24 05:56:45 +00:00
|
|
|
|
2013-10-20 12:44:39 +00:00
|
|
|
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
|
2012-06-24 05:56:45 +00:00
|
|
|
|
2021-11-02 02:46:48 +00:00
|
|
|
void fput(struct file *file)
|
2010-05-26 19:13:55 +00:00
|
|
|
{
|
fs: port files to file_ref
Port files to rely on file_ref reference to improve scaling and gain
overflow protection.
- We continue to WARN during get_file() in case a file that is already
marked dead is revived as get_file() is only valid if the caller
already holds a reference to the file. This hasn't changed just the
check changes.
- The semantics for epoll and ttm's dmabuf usage have changed. Both
epoll and ttm synchronize with __fput() to prevent the underlying file
from beeing freed.
(1) epoll
Explaining epoll is straightforward using a simple diagram.
Essentially, the mutex of the epoll instance needs to be taken in both
__fput() and around epi_fget() preventing the file from being freed
while it is polled or preventing the file from being resurrected.
CPU1 CPU2
fput(file)
-> __fput(file)
-> eventpoll_release(file)
-> eventpoll_release_file(file)
mutex_lock(&ep->mtx)
epi_item_poll()
-> epi_fget()
-> file_ref_get(file)
mutex_unlock(&ep->mtx)
mutex_lock(&ep->mtx);
__ep_remove()
mutex_unlock(&ep->mtx);
-> kmem_cache_free(file)
(2) ttm dmabuf
This explanation is a bit more involved. A regular dmabuf file stashed
the dmabuf in file->private_data and the file in dmabuf->file:
file->private_data = dmabuf;
dmabuf->file = file;
The generic release method of a dmabuf file handles file specific
things:
f_op->release::dma_buf_file_release()
while the generic dentry release method of a dmabuf handles dmabuf
freeing including driver specific things:
dentry->d_release::dma_buf_release()
During ttm dmabuf initialization in ttm_object_device_init() the ttm
driver copies the provided struct dma_buf_ops into a private location:
struct ttm_object_device {
spinlock_t object_lock;
struct dma_buf_ops ops;
void (*dmabuf_release)(struct dma_buf *dma_buf);
struct idr idr;
};
ttm_object_device_init(const struct dma_buf_ops *ops)
{
// copy original dma_buf_ops in private location
tdev->ops = *ops;
// stash the release method of the original struct dma_buf_ops
tdev->dmabuf_release = tdev->ops.release;
// override the release method in the copy of the struct dma_buf_ops
// with ttm's own dmabuf release method
tdev->ops.release = ttm_prime_dmabuf_release;
}
When a new dmabuf is created the struct dma_buf_ops with the overriden
release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &tdev->ops;
exp_info.size = prime->size;
exp_info.flags = flags;
exp_info.priv = prime;
The call to dma_buf_export() then sets
mutex_lock_interruptible(&prime->mutex);
dma_buf = dma_buf_export(&exp_info)
{
dmabuf->ops = exp_info->ops;
}
mutex_unlock(&prime->mutex);
which creates a new dmabuf file and then install a file descriptor to
it in the callers file descriptor table:
ret = dma_buf_fd(dma_buf, flags);
When that dmabuf file is closed we now get:
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
Where we can see that prime->dma_buf is set to NULL. So when we have
the following diagram:
CPU1 CPU2
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
ttm_prime_handle_to_fd()
mutex_lock_interruptible(&prime->mutex)
dma_buf = prime->dma_buf
dma_buf && get_dma_buf_unless_doomed(dma_buf)
-> file_ref_get(dma_buf->file)
mutex_unlock(&prime->mutex);
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
-> kmem_cache_free(file)
The logic of the mechanism is the same as for epoll: sync with
__fput() preventing the file from being freed. Here the
synchronization happens through the ttm instance's prime->mutex.
Basically, the lifetime of the dma_buf and the file are tighly
coupled.
Both (1) and (2) used to call atomic_inc_not_zero() to check whether
the file has already been marked dead and then refuse to revive it.
This is only safe because both (1) and (2) sync with __fput() and thus
prevent kmem_cache_free() on the file being called and thus prevent
the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.
Both (1) and (2) have been ported from atomic_inc_not_zero() to
file_ref_get(). That means a file that is already in the process of
being marked as FILE_REF_DEAD:
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
can be revived again:
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
This is fine and inherent to the file_ref_get()/file_ref_put()
semantics. For both (1) and (2) this is safe because __fput() is
prevented from making progress if file_ref_get() fails due to the
aforementioned synchronization mechanisms.
Two cases need to be considered that affect both (1) epoll and (2) ttm
dmabuf:
(i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but
before that fput() can mark the file as FILE_REF_DEAD someone
manages to sneak in a file_ref_get() and brings the refcount back
from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
fput() doesn't call __fput(). For epoll the poll will finish and
for ttm dmabuf the file can be used again. For ttm dambuf this is
actually an advantage because it avoids immediately allocating
a new dmabuf object.
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
(ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
also suceeds in actually marking it FILE_REF_DEAD and then calls
into __fput() to free the file.
When either (1) or (2) call file_ref_get() they fail as
atomic_long_add_negative() will return true.
At the same time, both (1) and (2) all file_ref_get() under
mutexes that __fput() must also acquire preventing
kmem_cache_free() from freeing the file.
So while this might be treated as a change in semantics for (1) and
(2) it really isn't. It if should end up causing issues this can be
fixed by adding a helper that does something like:
long cnt = atomic_long_read(&ref->refcnt);
do {
if (cnt < 0)
return false;
} while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
return true;
which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.
- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
anymore once files have been ported to file_ref_t.
The kmem_cache_zalloc() call will memset() the whole struct file to
zero when it is reallocated. This will also set file->f_ref to zero
which mens that a concurrent file_ref_get() can return true:
CPU1 CPU2
__get_file_rcu()
rcu_dereference_raw()
close()
[frees file]
alloc_empty_file()
kmem_cache_zalloc()
[reallocates same file]
memset(..., 0, ...)
file_ref_get()
[increments 0->1, returns true]
init_file()
file_ref_init(..., 1)
[sets to 0]
rcu_dereference_raw()
fput()
file_ref_put()
[decrements 0->FILE_REF_NOREF, frees file]
[UAF]
causing a concurrent __get_file_rcu() call to acquire a reference to
the file that is about to be reallocated and immediately freeing it
on realizing that it has been recycled. This causes a UAF for the
task that reallocated/recycled the file.
This is prevented by switching from kmem_cache_zalloc() to
kmem_cache_alloc() and initializing the fields manually. With
file->f_ref initialized last.
Note that a memset() also isn't guaranteed to atomically update an
unsigned long so it's theoretically possible to see torn and
therefore bogus counter values.
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-10-07 14:23:59 +00:00
|
|
|
if (file_ref_put(&file->f_ref)) {
|
2012-06-24 05:56:45 +00:00
|
|
|
struct task_struct *task = current;
|
2013-06-14 19:09:47 +00:00
|
|
|
|
2023-11-26 02:08:34 +00:00
|
|
|
if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
|
|
|
|
file_free(file);
|
|
|
|
return;
|
|
|
|
}
|
2013-06-14 19:09:47 +00:00
|
|
|
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
|
2023-11-30 12:49:09 +00:00
|
|
|
init_task_work(&file->f_task_work, ____fput);
|
|
|
|
if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
|
2013-06-14 19:09:47 +00:00
|
|
|
return;
|
2013-07-08 21:24:15 +00:00
|
|
|
/*
|
|
|
|
* After this task has run exit_task_work(),
|
2013-09-11 21:24:34 +00:00
|
|
|
* task_work_add() will fail. Fall through to delayed
|
2013-07-08 21:24:15 +00:00
|
|
|
* fput to avoid leaking *file.
|
|
|
|
*/
|
2012-06-24 05:56:45 +00:00
|
|
|
}
|
2013-07-08 21:24:16 +00:00
|
|
|
|
2022-05-22 13:28:12 +00:00
|
|
|
if (llist_add(&file->f_llist, &delayed_fput_list))
|
2013-10-20 12:44:39 +00:00
|
|
|
schedule_delayed_work(&delayed_fput_work, 1);
|
2012-06-24 05:56:45 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* synchronous analog of fput(); for kernel threads that might be needed
|
|
|
|
* in some umount() (and thus can't use flush_delayed_fput() without
|
|
|
|
* risking deadlocks), need to wait for completion of __fput() and know
|
|
|
|
* for this specific struct file it won't involve anything that would
|
|
|
|
* need them. Use only if you really need it - at the very least,
|
|
|
|
* don't blindly convert fput() by kernel thread to that.
|
|
|
|
*/
|
|
|
|
void __fput_sync(struct file *file)
|
|
|
|
{
|
fs: port files to file_ref
Port files to rely on file_ref reference to improve scaling and gain
overflow protection.
- We continue to WARN during get_file() in case a file that is already
marked dead is revived as get_file() is only valid if the caller
already holds a reference to the file. This hasn't changed just the
check changes.
- The semantics for epoll and ttm's dmabuf usage have changed. Both
epoll and ttm synchronize with __fput() to prevent the underlying file
from beeing freed.
(1) epoll
Explaining epoll is straightforward using a simple diagram.
Essentially, the mutex of the epoll instance needs to be taken in both
__fput() and around epi_fget() preventing the file from being freed
while it is polled or preventing the file from being resurrected.
CPU1 CPU2
fput(file)
-> __fput(file)
-> eventpoll_release(file)
-> eventpoll_release_file(file)
mutex_lock(&ep->mtx)
epi_item_poll()
-> epi_fget()
-> file_ref_get(file)
mutex_unlock(&ep->mtx)
mutex_lock(&ep->mtx);
__ep_remove()
mutex_unlock(&ep->mtx);
-> kmem_cache_free(file)
(2) ttm dmabuf
This explanation is a bit more involved. A regular dmabuf file stashed
the dmabuf in file->private_data and the file in dmabuf->file:
file->private_data = dmabuf;
dmabuf->file = file;
The generic release method of a dmabuf file handles file specific
things:
f_op->release::dma_buf_file_release()
while the generic dentry release method of a dmabuf handles dmabuf
freeing including driver specific things:
dentry->d_release::dma_buf_release()
During ttm dmabuf initialization in ttm_object_device_init() the ttm
driver copies the provided struct dma_buf_ops into a private location:
struct ttm_object_device {
spinlock_t object_lock;
struct dma_buf_ops ops;
void (*dmabuf_release)(struct dma_buf *dma_buf);
struct idr idr;
};
ttm_object_device_init(const struct dma_buf_ops *ops)
{
// copy original dma_buf_ops in private location
tdev->ops = *ops;
// stash the release method of the original struct dma_buf_ops
tdev->dmabuf_release = tdev->ops.release;
// override the release method in the copy of the struct dma_buf_ops
// with ttm's own dmabuf release method
tdev->ops.release = ttm_prime_dmabuf_release;
}
When a new dmabuf is created the struct dma_buf_ops with the overriden
release method set to ttm_prime_dmabuf_release is passed in exp_info.ops:
DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
exp_info.ops = &tdev->ops;
exp_info.size = prime->size;
exp_info.flags = flags;
exp_info.priv = prime;
The call to dma_buf_export() then sets
mutex_lock_interruptible(&prime->mutex);
dma_buf = dma_buf_export(&exp_info)
{
dmabuf->ops = exp_info->ops;
}
mutex_unlock(&prime->mutex);
which creates a new dmabuf file and then install a file descriptor to
it in the callers file descriptor table:
ret = dma_buf_fd(dma_buf, flags);
When that dmabuf file is closed we now get:
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
Where we can see that prime->dma_buf is set to NULL. So when we have
the following diagram:
CPU1 CPU2
fput(file)
-> __fput(file)
-> f_op->release::dma_buf_file_release()
-> dput()
-> d_op->d_release::dma_buf_release()
-> dmabuf->ops->release::ttm_prime_dmabuf_release()
ttm_prime_handle_to_fd()
mutex_lock_interruptible(&prime->mutex)
dma_buf = prime->dma_buf
dma_buf && get_dma_buf_unless_doomed(dma_buf)
-> file_ref_get(dma_buf->file)
mutex_unlock(&prime->mutex);
mutex_lock(&prime->mutex);
if (prime->dma_buf == dma_buf)
prime->dma_buf = NULL;
mutex_unlock(&prime->mutex);
-> kmem_cache_free(file)
The logic of the mechanism is the same as for epoll: sync with
__fput() preventing the file from being freed. Here the
synchronization happens through the ttm instance's prime->mutex.
Basically, the lifetime of the dma_buf and the file are tighly
coupled.
Both (1) and (2) used to call atomic_inc_not_zero() to check whether
the file has already been marked dead and then refuse to revive it.
This is only safe because both (1) and (2) sync with __fput() and thus
prevent kmem_cache_free() on the file being called and thus prevent
the file from being immediately recycled due to SLAB_TYPESAFE_BY_RCU.
Both (1) and (2) have been ported from atomic_inc_not_zero() to
file_ref_get(). That means a file that is already in the process of
being marked as FILE_REF_DEAD:
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
can be revived again:
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
This is fine and inherent to the file_ref_get()/file_ref_put()
semantics. For both (1) and (2) this is safe because __fput() is
prevented from making progress if file_ref_get() fails due to the
aforementioned synchronization mechanisms.
Two cases need to be considered that affect both (1) epoll and (2) ttm
dmabuf:
(i) fput()'s file_ref_put() and marks the file as FILE_REF_NOREF but
before that fput() can mark the file as FILE_REF_DEAD someone
manages to sneak in a file_ref_get() and brings the refcount back
from FILE_REF_NOREF to FILE_REF_ONEREF. In that case the original
fput() doesn't call __fput(). For epoll the poll will finish and
for ttm dmabuf the file can be used again. For ttm dambuf this is
actually an advantage because it avoids immediately allocating
a new dmabuf object.
CPU1 CPU2
file_ref_put()
cnt = atomic_long_dec_return()
-> __file_ref_put(cnt)
if (cnt == FIlE_REF_NOREF)
file_ref_get()
// Brings reference back to FILE_REF_ONEREF
atomic_long_add_negative()
atomic_long_try_cmpxchg_release(cnt, FILE_REF_DEAD)
(ii) fput()'s file_ref_put() marks the file FILE_REF_NOREF and
also suceeds in actually marking it FILE_REF_DEAD and then calls
into __fput() to free the file.
When either (1) or (2) call file_ref_get() they fail as
atomic_long_add_negative() will return true.
At the same time, both (1) and (2) all file_ref_get() under
mutexes that __fput() must also acquire preventing
kmem_cache_free() from freeing the file.
So while this might be treated as a change in semantics for (1) and
(2) it really isn't. It if should end up causing issues this can be
fixed by adding a helper that does something like:
long cnt = atomic_long_read(&ref->refcnt);
do {
if (cnt < 0)
return false;
} while (!atomic_long_try_cmpxchg(&ref->refcnt, &cnt, cnt + 1));
return true;
which would block FILE_REF_NOREF to FILE_REF_ONEREF transitions.
- Jann correctly pointed out that kmem_cache_zalloc() cannot be used
anymore once files have been ported to file_ref_t.
The kmem_cache_zalloc() call will memset() the whole struct file to
zero when it is reallocated. This will also set file->f_ref to zero
which mens that a concurrent file_ref_get() can return true:
CPU1 CPU2
__get_file_rcu()
rcu_dereference_raw()
close()
[frees file]
alloc_empty_file()
kmem_cache_zalloc()
[reallocates same file]
memset(..., 0, ...)
file_ref_get()
[increments 0->1, returns true]
init_file()
file_ref_init(..., 1)
[sets to 0]
rcu_dereference_raw()
fput()
file_ref_put()
[decrements 0->FILE_REF_NOREF, frees file]
[UAF]
causing a concurrent __get_file_rcu() call to acquire a reference to
the file that is about to be reallocated and immediately freeing it
on realizing that it has been recycled. This causes a UAF for the
task that reallocated/recycled the file.
This is prevented by switching from kmem_cache_zalloc() to
kmem_cache_alloc() and initializing the fields manually. With
file->f_ref initialized last.
Note that a memset() also isn't guaranteed to atomically update an
unsigned long so it's theoretically possible to see torn and
therefore bogus counter values.
Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-3-387e24dc9163@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-10-07 14:23:59 +00:00
|
|
|
if (file_ref_put(&file->f_ref))
|
2010-05-26 19:13:55 +00:00
|
|
|
__fput(file);
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(fput);
|
2022-04-03 19:58:11 +00:00
|
|
|
EXPORT_SYMBOL(__fput_sync);
|
2010-05-26 19:13:55 +00:00
|
|
|
|
2015-08-06 22:46:20 +00:00
|
|
|
void __init files_init(void)
|
2017-08-07 08:45:39 +00:00
|
|
|
{
|
2024-09-05 07:56:56 +00:00
|
|
|
struct kmem_cache_args args = {
|
|
|
|
.use_freeptr_offset = true,
|
|
|
|
.freeptr_offset = offsetof(struct file, f_freeptr),
|
|
|
|
};
|
|
|
|
|
|
|
|
filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
|
|
|
|
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
|
|
|
|
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
|
2024-10-07 14:23:57 +00:00
|
|
|
|
|
|
|
args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
|
|
|
|
bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
|
|
|
|
&args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
|
|
|
|
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
|
2015-08-06 22:46:20 +00:00
|
|
|
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
|
|
|
|
}
|
2008-12-10 17:35:45 +00:00
|
|
|
|
2015-08-06 22:46:20 +00:00
|
|
|
/*
|
|
|
|
* One file with associated inode and dcache is very roughly 1K. Per default
|
|
|
|
* do not use more than 10% of our memory for files.
|
|
|
|
*/
|
|
|
|
void __init files_maxfiles_init(void)
|
|
|
|
{
|
|
|
|
unsigned long n;
|
2018-12-28 08:34:29 +00:00
|
|
|
unsigned long nr_pages = totalram_pages();
|
2018-12-28 08:34:20 +00:00
|
|
|
unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
|
2015-08-06 22:46:20 +00:00
|
|
|
|
2018-12-28 08:34:20 +00:00
|
|
|
memreserve = min(memreserve, nr_pages - 1);
|
|
|
|
n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2010-10-26 21:22:44 +00:00
|
|
|
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
|
2017-08-07 08:45:39 +00:00
|
|
|
}
|