fuse: Convert to using invalidate_lock
Use invalidate_lock instead of fuse's private i_mmap_sem. The intended
purpose is exactly the same. By this conversion we fix a long standing
race between hole punching and read(2) / readahead(2) paths that can
lead to stale page cache contents.

CC: Miklos Szeredi <miklos@szeredi.hu>
Reviewed-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
This commit is contained in:
parent edc6d01bad
commit 8bcbbe9c7c
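The locking discipline this patch adopts, in a minimal userspace sketch: one rwlock (a pthread rwlock standing in for mapping->invalidate_lock) is taken exclusive by paths that punch holes or truncate, and shared by paths that (re)populate the page cache, so a reader can never reinstate pages over a range that is being freed. All names below (struct cache, hole_punch, cached_read) are illustrative stand-ins, not kernel or fuse APIs.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

#define NPAGES 8

/* Illustrative stand-ins, not kernel APIs. */
struct cache {
	pthread_rwlock_t invalidate_lock;	/* stand-in for mapping->invalidate_lock */
	atomic_int cached[NPAGES];		/* 1 = "page cache" holds this page */
	char data[NPAGES];			/* "backing store" contents */
};

/* Hole punch: lock held exclusive across freeing and the cache drop. */
static void hole_punch(struct cache *c, int page)
{
	pthread_rwlock_wrlock(&c->invalidate_lock);
	c->data[page] = 0;			/* free the backing range */
	atomic_store(&c->cached[page], 0);	/* ...and drop the cached copy */
	pthread_rwlock_unlock(&c->invalidate_lock);
}

/*
 * Read: lock held shared while (re)populating the cache, so the fill
 * cannot race with hole_punch() and leave stale contents behind.
 */
static char cached_read(struct cache *c, int page)
{
	char v;

	pthread_rwlock_rdlock(&c->invalidate_lock);
	if (!atomic_load(&c->cached[page]))
		atomic_store(&c->cached[page], 1);	/* fill from backing store */
	v = c->data[page];
	pthread_rwlock_unlock(&c->invalidate_lock);
	return v;
}

int main(void)
{
	struct cache c = { .invalidate_lock = PTHREAD_RWLOCK_INITIALIZER };

	memset(c.data, 'x', sizeof(c.data));
	printf("before punch: %c\n", cached_read(&c, 3));	/* 'x' */
	hole_punch(&c, 3);
	printf("after punch: %d\n", cached_read(&c, 3));	/* 0, not a stale 'x' */
	return 0;
}

Without the shared lock in the read path, a read racing with the punch could re-fill the cache from the old contents after the punch dropped it; that window is exactly the stale-page-cache race the commit message describes.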
@@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
 	/*
 	 * Can't do inline reclaim in fault path. We call
 	 * dax_layout_busy_page() before we free a range. And
-	 * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it.
-	 * In fault path we enter with fi->i_mmap_sem held and can't drop
-	 * it. Also in fault path we hold fi->i_mmap_sem shared and not
-	 * exclusive, so that creates further issues with fuse_wait_dax_page().
-	 * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory
-	 * range to become free and retry.
+	 * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
+	 * In fault path we enter with mapping->invalidate_lock held and can't
+	 * drop it. Also in fault path we hold mapping->invalidate_lock shared
+	 * and not exclusive, so that creates further issues with
+	 * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
+	 * will wait for a memory range to become free and retry.
 	 */
 	if (flags & IOMAP_FAULT) {
 		alloc_dmap = alloc_dax_mapping(fcd);
@@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
 	down_write(&fi->dax->sem);
 	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);

-	/* We are holding either inode lock or i_mmap_sem, and that should
+	/* We are holding either inode lock or invalidate_lock, and that should
 	 * ensure that dmap can't be truncated. We are holding a reference
 	 * on dmap and that should make sure it can't be reclaimed. So dmap
 	 * should still be there in tree despite the fact we dropped and
@@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = {

 static void fuse_wait_dax_page(struct inode *inode)
 {
-	struct fuse_inode *fi = get_fuse_inode(inode);
-
-	up_write(&fi->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
 	schedule();
-	down_write(&fi->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
 }

-/* Should be called with fi->i_mmap_sem lock held exclusively */
+/* Should be called with mapping->invalidate_lock held exclusively */
 static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 				    loff_t start, loff_t end)
 {
@@ -813,18 +811,18 @@ retry:
 	 * we do not want any read/write/mmap to make progress and try
 	 * to populate page cache or access memory we are trying to free.
 	 */
-	down_read(&get_fuse_inode(inode)->i_mmap_sem);
+	filemap_invalidate_lock_shared(inode->i_mapping);
 	ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
 	if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
 		error = 0;
 		retry = true;
-		up_read(&get_fuse_inode(inode)->i_mmap_sem);
+		filemap_invalidate_unlock_shared(inode->i_mapping);
 		goto retry;
 	}

 	if (ret & VM_FAULT_NEEDDSYNC)
 		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
-	up_read(&get_fuse_inode(inode)->i_mmap_sem);
+	filemap_invalidate_unlock_shared(inode->i_mapping);

 	if (write)
 		sb_end_pagefault(sb);
@@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
 	int ret;
 	struct interval_tree_node *node;

-	down_write(&fi->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);

 	/* Lookup a dmap and corresponding file offset to reclaim. */
 	down_read(&fi->dax->sem);
@@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
 out_write_dmap_sem:
 	up_write(&fi->dax->sem);
 out_mmap_sem:
-	up_write(&fi->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
 	return dmap;
 }

@@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
 		 * had a reference or some other temporary failure,
 		 * Try again. We want to give up inline reclaim only
 		 * if there is no range assigned to this node. Otherwise
-		 * if a deadlock is possible if we sleep with fi->i_mmap_sem
-		 * held and worker to free memory can't make progress due
-		 * to unavailability of fi->i_mmap_sem lock. So sleep
-		 * only if fi->dax->nr=0
+		 * if a deadlock is possible if we sleep with
+		 * mapping->invalidate_lock held and worker to free memory
+		 * can't make progress due to unavailability of
+		 * mapping->invalidate_lock. So sleep only if fi->dax->nr=0
 		 */
 		if (retry)
 			continue;
@@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
 		 * There are no mappings which can be reclaimed. Wait for one.
 		 * We are not holding fi->dax->sem. So it is possible
 		 * that range gets added now. But as we are not holding
-		 * fi->i_mmap_sem, worker should still be able to free up
-		 * a range and wake us up.
+		 * mapping->invalidate_lock, worker should still be able to
+		 * free up a range and wake us up.
 		 */
 		if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
 			if (wait_event_killable_exclusive(fcd->range_waitq,
@@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
 /*
  * Free a range of memory.
  * Locking:
- * 1. Take fi->i_mmap_sem to block dax faults.
+ * 1. Take mapping->invalidate_lock to block dax faults.
  * 2. Take fi->dax->sem to protect interval tree and also to make sure
  *    read/write can not reuse a dmap which we might be freeing.
  */
@@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
 	loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
 	loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;

-	down_write(&fi->i_mmap_sem);
+	filemap_invalidate_lock(inode->i_mapping);
 	ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
 	if (ret) {
 		pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
@@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
 	ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
 	up_write(&fi->dax->sem);
 out_mmap_sem:
-	up_write(&fi->i_mmap_sem);
+	filemap_invalidate_unlock(inode->i_mapping);
 	return ret;
 }
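A note on the fuse_wait_dax_page() / fuse_dax_break_layouts() pattern in the hunks above: the reclaim side must drop the exclusive lock before sleeping, otherwise the path that still holds a page busy could never finish and release it, and after re-taking the lock the busy check has to be redone from scratch. A minimal sketch of that drop-wait-retake loop, again with pthreads as a stand-in (break_layouts and busy_pages are illustrative, not kernel APIs):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>

static pthread_rwlock_t invalidate_lock = PTHREAD_RWLOCK_INITIALIZER;
static atomic_int busy_pages;	/* stand-in for dax_layout_busy_page() */

/* Caller enters and leaves with invalidate_lock held exclusive. */
static void break_layouts(void)
{
	while (atomic_load(&busy_pages) > 0) {
		/* Drop the lock so the user of the busy page can finish... */
		pthread_rwlock_unlock(&invalidate_lock);
		sched_yield();	/* stand-in for schedule() in fuse_wait_dax_page() */
		/* ...then re-take it and re-check, since the set of busy
		 * pages may have changed while the lock was dropped. */
		pthread_rwlock_wrlock(&invalidate_lock);
	}
}

int main(void)
{
	pthread_rwlock_wrlock(&invalidate_lock);
	break_layouts();	/* returns immediately: no busy pages */
	pthread_rwlock_unlock(&invalidate_lock);
	return 0;
}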
@@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	struct fuse_mount *fm = get_fuse_mount(inode);
 	struct fuse_conn *fc = fm->fc;
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct address_space *mapping = inode->i_mapping;
 	FUSE_ARGS(args);
 	struct fuse_setattr_in inarg;
 	struct fuse_attr_out outarg;
@@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	}

 	if (FUSE_IS_DAX(inode) && is_truncate) {
-		down_write(&fi->i_mmap_sem);
+		filemap_invalidate_lock(mapping);
 		fault_blocked = true;
 		err = fuse_dax_break_layouts(inode, 0, 0);
 		if (err) {
-			up_write(&fi->i_mmap_sem);
+			filemap_invalidate_unlock(mapping);
 			return err;
 		}
 	}
@@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
 	if ((is_truncate || !is_wb) &&
 	    S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
 		truncate_pagecache(inode, outarg.attr.size);
-		invalidate_inode_pages2(inode->i_mapping);
+		invalidate_inode_pages2(mapping);
 	}

 	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 out:
 	if (fault_blocked)
-		up_write(&fi->i_mmap_sem);
+		filemap_invalidate_unlock(mapping);

 	return 0;

@@ -1711,7 +1712,7 @@ error:
 	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

 	if (fault_blocked)
-		up_write(&fi->i_mmap_sem);
+		filemap_invalidate_unlock(mapping);
 	return err;
 }
@@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	}

 	if (dax_truncate) {
-		down_write(&get_fuse_inode(inode)->i_mmap_sem);
+		filemap_invalidate_lock(inode->i_mapping);
 		err = fuse_dax_break_layouts(inode, 0, 0);
 		if (err)
 			goto out;
@@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)

 out:
 	if (dax_truncate)
-		up_write(&get_fuse_inode(inode)->i_mmap_sem);
+		filemap_invalidate_unlock(inode->i_mapping);

 	if (is_wb_truncate | dax_truncate) {
 		fuse_release_nowrite(inode);
@@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	if (lock_inode) {
 		inode_lock(inode);
 		if (block_faults) {
-			down_write(&fi->i_mmap_sem);
+			filemap_invalidate_lock(inode->i_mapping);
 			err = fuse_dax_break_layouts(inode, 0, 0);
 			if (err)
 				goto out;
@@ -2976,7 +2976,7 @@ out:
 	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

 	if (block_faults)
-		up_write(&fi->i_mmap_sem);
+		filemap_invalidate_unlock(inode->i_mapping);

 	if (lock_inode)
 		inode_unlock(inode);
@@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
 	 * modifications. Yet this does give less guarantees than if the
 	 * copying was performed with write(2).
 	 *
-	 * To fix this a i_mmap_sem style lock could be used to prevent new
+	 * To fix this a mapping->invalidate_lock could be used to prevent new
 	 * faults while the copy is ongoing.
 	 */
 	err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
@@ -149,13 +149,6 @@ struct fuse_inode {
 	/** Lock to protect write related fields */
 	spinlock_t lock;

-	/**
-	 * Can't take inode lock in fault path (leads to circular dependency).
-	 * Introduce another semaphore which can be taken in fault path and
-	 * then other filesystem paths can take this to block faults.
-	 */
-	struct rw_semaphore i_mmap_sem;
-
 #ifdef CONFIG_FUSE_DAX
 	/*
 	 * Dax specific inode data
@@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->orig_ino = 0;
 	fi->state = 0;
 	mutex_init(&fi->mutex);
-	init_rwsem(&fi->i_mmap_sem);
 	spin_lock_init(&fi->lock);
 	fi->forget = fuse_alloc_forget();
 	if (!fi->forget)