linux/fs/nfs/file.c
Trond Myklebust 72cb77f4a5 NFS: Throttle page dirtying while we're flushing to disk
The following patch is a combination of a patch by myself and Peter
Staubach.

Trond: If we allow other processes to dirty pages while a process is doing
a consistency sync to disk, we can end up never making progress.

Peter: Attached is a patch which addresses a continuing problem with
the NFS client generating out of order WRITE requests.  While
this is compliant with all of the current protocol
specifications, there are servers in the market which can not
handle out of order WRITE requests very well.  Also, this may
lead to sub-optimal block allocations in the underlying file
system on the server.  This may cause the read throughputs to
be reduced when reading the file from the server.

Peter: There has been a lot of work recently done to address out of
order issues on a systemic level.  However, the NFS client is
still susceptible to the problem.  Out of order WRITE
requests can occur when pdflush is in the middle of writing
out pages while the process dirtying the pages calls
generic_file_buffered_write which calls
generic_perform_write which calls
balance_dirty_pages_rate_limited which ends up calling
writeback_inodes which ends up calling back into the NFS
client to write out dirty pages for the same file that
pdflush happens to be working with.

Signed-off-by: Peter Staubach <staubach@redhat.com>
[modification by Trond to merge the two similar patches]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
2009-03-11 14:10:30 -04:00

739 lines
19 KiB
C

/*
* linux/fs/nfs/file.c
*
* Copyright (C) 1992 Rick Sladkey
*
* Changes Copyright (C) 1994 by Florian La Roche
* - Do not copy data too often around in the kernel.
* - In nfs_file_read the return value of kmalloc wasn't checked.
* - Put in a better version of read look-ahead buffering. Original idea
* and implementation by Wai S Kok elekokws@ee.nus.sg.
*
* Expire cache on write to a file by Wai S Kok (Oct 1994).
*
* Total rewrite of read side for new NFS buffer cache.. Linus.
*
* nfs regular file handling functions
*/
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include "delegation.h"
#include "internal.h"
#include "iostat.h"
/*
 * Debug facility selected for this file.
 * NOTE(review): presumably consumed by the dprintk()/dfprintk() calls
 * below - confirm against the sunrpc debug macros.
 */
#define NFSDBG_FACILITY NFSDBG_FILE
/*
 * Forward declarations for the handlers referenced by the
 * nfs_file_operations table and by nfs_file_mmap(), all of which are
 * defined further down in this file.
 */
static int nfs_file_open(struct inode *, struct file *);
static int nfs_file_release(struct inode *, struct file *);
static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
static int nfs_file_mmap(struct file *, struct vm_area_struct *);
static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t count, unsigned int flags);
static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
static int nfs_file_flush(struct file *, fl_owner_t id);
static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
static int nfs_check_flags(int flags);
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
/* VM operations table; defined after nfs_vm_page_mkwrite() below */
static struct vm_operations_struct nfs_file_vm_ops;
/*
 * File operations for NFS regular files.
 *
 * Synchronous read/write go through do_sync_read/do_sync_write, while
 * the aio entry points are nfs_file_read()/nfs_file_write() from this
 * file.
 */
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = nfs_file_read,
.aio_write = nfs_file_write,
/* Without an MMU, fall back to generic_file_mmap instead of nfs_file_mmap */
#ifdef CONFIG_MMU
.mmap = nfs_file_mmap,
#else
.mmap = generic_file_mmap,
#endif
.open = nfs_file_open,
.flush = nfs_file_flush,
.release = nfs_file_release,
.fsync = nfs_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
.check_flags = nfs_check_flags,
.setlease = nfs_setlease,
};
/*
 * Inode operations for NFS regular files: permission checking and
 * attribute get/set only. The handlers are defined outside this file.
 */
const struct inode_operations nfs_file_inode_operations = {
.permission = nfs_permission,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
};
#ifdef CONFIG_NFS_V3
/*
 * Inode operations for NFSv3 regular files: the same permission and
 * attribute handlers as nfs_file_inode_operations, plus the nfs3_*
 * extended-attribute handlers. Only built when CONFIG_NFS_V3 is set.
 */
const struct inode_operations nfs3_file_inode_operations = {
.permission = nfs_permission,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
.listxattr = nfs3_listxattr,
.getxattr = nfs3_getxattr,
.setxattr = nfs3_setxattr,
.removexattr = nfs3_removexattr,
};
#endif /* CONFIG_NFS_V3 */
/*
 * Hack for future NFS swap support.
 *
 * Fallback used only when IS_SWAPFILE is not already defined: it
 * evaluates to 0 for every inode, so the swap-file check in
 * nfs_file_write() is never taken.
 */
#ifndef IS_SWAPFILE
# define IS_SWAPFILE(inode) (0)
#endif
/*
 * Validate a set of open flags.
 *
 * Returns -EINVAL when both O_APPEND and O_DIRECT are set in @flags,
 * and 0 for every other combination.
 */
static int nfs_check_flags(int flags)
{
	const int rejected_combo = O_APPEND | O_DIRECT;

	return ((flags & rejected_combo) == rejected_combo) ? -EINVAL : 0;
}
/*
 * Open a regular file.
 *
 * The open flags are checked with nfs_check_flags() first; if they are
 * rejected, that error is returned and nothing else happens. Otherwise
 * the VFSOPEN statistic is bumped and the result of nfs_open() is
 * returned.
 */
static int
nfs_file_open(struct inode *inode, struct file *filp)
{
	int err;

	dprintk("NFS: open file(%s/%s)\n",
			filp->f_path.dentry->d_parent->d_name.name,
			filp->f_path.dentry->d_name.name);

	err = nfs_check_flags(filp->f_flags);
	if (err != 0)
		return err;

	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
	return nfs_open(inode, filp);
}
/*
 * Release a regular file.
 *
 * Files opened for writing have their dirty pages written back first
 * (the result of that writeback is not propagated). The VFSRELEASE
 * statistic is then bumped and the result of nfs_release() returned.
 */
static int
nfs_file_release(struct inode *inode, struct file *filp)
{
	struct dentry *dentry = filp->f_path.dentry;
	int status;

	dprintk("NFS: release(%s/%s)\n",
			dentry->d_parent->d_name.name,
			dentry->d_name.name);

	/* Ensure that dirty pages are flushed out with the right creds */
	if ((filp->f_mode & FMODE_WRITE) != 0)
		nfs_wb_all(dentry->d_inode);

	nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
	status = nfs_release(inode, filp);
	return status;
}
/**
 * nfs_revalidate_file_size - Revalidate the file size
 * @inode: pointer to inode struct
 * @filp: pointer to struct file
 *
 * Wrapper around __nfs_revalidate_inode() that decides whether the
 * cached file length may be used as-is:
 *
 *  - on NFS_MOUNT_NOAC mounts and for O_DIRECT files the inode is
 *    always revalidated (the cache should not be trusted);
 *  - otherwise, if nfsi->npages is non-zero (we may have cached
 *    writes, so the server's idea of the length is not interesting)
 *    nothing is done;
 *  - otherwise revalidation is skipped only when
 *    NFS_INO_REVAL_PAGECACHE is clear and the attributes have not
 *    timed out.
 *
 * Returns 0 when no revalidation was needed, else the result of
 * __nfs_revalidate_inode().
 */
static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
{
	struct nfs_server *server = NFS_SERVER(inode);
	struct nfs_inode *nfsi = NFS_I(inode);

	if (!(server->flags & NFS_MOUNT_NOAC) && !(filp->f_flags & O_DIRECT)) {
		if (nfsi->npages != 0)
			return 0;
		if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) &&
		    !nfs_attribute_timeout(inode))
			return 0;
	}

	return __nfs_revalidate_inode(server, inode);
}
/*
 * Seek within a regular file.
 *
 * Every origin other than SEEK_END goes straight to
 * generic_file_llseek_unlocked(). For SEEK_END the cached file length
 * is revalidated first (a negative result is returned as the seek
 * result) and the seek itself is performed under inode->i_lock.
 */
static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
{
	struct inode *inode;
	loff_t newpos;
	int err;

	dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
			filp->f_path.dentry->d_parent->d_name.name,
			filp->f_path.dentry->d_name.name,
			offset, origin);

	if (origin != SEEK_END)
		return generic_file_llseek_unlocked(filp, offset, origin);

	/* origin == SEEK_END => we must revalidate the cached file length */
	inode = filp->f_mapping->host;
	err = nfs_revalidate_file_size(inode, filp);
	if (err < 0)
		return (loff_t)err;

	spin_lock(&inode->i_lock);
	newpos = generic_file_llseek_unlocked(filp, offset, origin);
	spin_unlock(&inode->i_lock);

	return newpos;
}
/*
* Helper for nfs_file_flush() and nfs_file_fsync()
*
* Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
* disk, but it retrieves and clears ctx->error after synching, despite
* the two being set at the same time in nfs_context_set_write_error().
* This is because the former is used to notify the _next_ call to
* nfs_file_write() that a write error occured, and hence cause it to
* fall back to doing a synchronous write.
*/
static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
{
int have_error, status;
int ret = 0;
have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
status = nfs_wb_all(inode);
have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
if (have_error)
ret = xchg(&ctx->error, 0);
if (!ret)
ret = status;
return ret;
}
/*
 * Flush all dirty pages, and check for write errors.
 *
 * Files not opened for writing return 0 immediately. Otherwise the
 * VFSFLUSH statistic is bumped and nfs_do_fsync() is run; on failure
 * its status is returned, on success the inode is revalidated (result
 * ignored) and 0 is returned.
 */
static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
	struct nfs_open_context *ctx = nfs_file_open_context(file);
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	int err;

	dprintk("NFS: flush(%s/%s)\n",
			dentry->d_parent->d_name.name,
			dentry->d_name.name);

	if (!(file->f_mode & FMODE_WRITE))
		return 0;

	nfs_inc_stats(inode, NFSIOS_VFSFLUSH);

	/* Ensure that data+attribute caches are up to date after close() */
	err = nfs_do_fsync(ctx, inode);
	if (err != 0)
		return err;

	nfs_revalidate_inode(NFS_SERVER(inode), inode);
	return 0;
}
/*
 * aio_read entry point for regular files.
 *
 * O_DIRECT files are handed to nfs_file_direct_read(). For everything
 * else the mapping is revalidated, the requested byte count is added
 * to the NORMALREADBYTES statistic (whether or not revalidation
 * succeeded), and the read is delegated to generic_file_aio_read()
 * unless revalidation returned an error.
 */
static ssize_t
nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct dentry *dentry = filp->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	size_t count = iov_length(iov, nr_segs);
	ssize_t result;

	if (filp->f_flags & O_DIRECT)
		return nfs_file_direct_read(iocb, iov, nr_segs, pos);

	dprintk("NFS: read(%s/%s, %lu@%lu)\n",
			dentry->d_parent->d_name.name, dentry->d_name.name,
			(unsigned long) count, (unsigned long) pos);

	result = nfs_revalidate_mapping(inode, filp->f_mapping);
	nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
	if (result != 0)
		return result;

	return generic_file_aio_read(iocb, iov, nr_segs, pos);
}
/*
 * splice_read entry point for regular files.
 *
 * Revalidates the mapping and, if that succeeds, delegates to
 * generic_file_splice_read(); otherwise the revalidation error is
 * returned.
 */
static ssize_t
nfs_file_splice_read(struct file *filp, loff_t *ppos,
		struct pipe_inode_info *pipe, size_t count,
		unsigned int flags)
{
	struct dentry *dentry = filp->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	ssize_t err;

	dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
			dentry->d_parent->d_name.name, dentry->d_name.name,
			(unsigned long) count, (unsigned long long) *ppos);

	err = nfs_revalidate_mapping(inode, filp->f_mapping);
	if (err != 0)
		return err;

	return generic_file_splice_read(filp, ppos, pipe, count, flags);
}
/*
 * mmap entry point for regular files.
 *
 * Revalidates the mapping first. On success the vma is switched to
 * nfs_file_vm_ops, VM_CAN_NONLINEAR is added to its flags and the file
 * is marked accessed; on failure the vma is left untouched and the
 * revalidation error is returned.
 */
static int
nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	int err;

	dprintk("NFS: mmap(%s/%s)\n",
			dentry->d_parent->d_name.name, dentry->d_name.name);

	err = nfs_revalidate_mapping(inode, file->f_mapping);
	if (err != 0)
		return err;

	vma->vm_ops = &nfs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;
	file_accessed(file);
	return 0;
}
/*
 * Flush any dirty pages for this process, and check for write errors.
 * The return status from this call provides a reliable indication of
 * whether any write errors occurred for this process.
 *
 * @datasync is only logged; the work is done by nfs_do_fsync() on the
 * file's open context and the dentry's inode.
 */
static int
nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct nfs_open_context *ctx = nfs_file_open_context(file);
	struct inode *inode = dentry->d_inode;
	int status;

	dprintk("NFS: fsync file(%s/%s) datasync %d\n",
			dentry->d_parent->d_name.name, dentry->d_name.name,
			datasync);

	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
	status = nfs_do_fsync(ctx, inode);
	return status;
}
/*
 * This does the "real" work of the write. We must allocate and lock the
 * page to be sent back to the generic routine, which then copies the
 * data from user space.
 *
 * If the writer ends up delaying the write, the writer needs to
 * increment the page use counts until he is done with the page.
 *
 * @file:    file being written
 * @mapping: address space the page belongs to
 * @pos:     byte position of the write; selects the page index
 * @len:     length of the write (only used for the debug message here)
 * @flags:   passed through to grab_cache_page_write_begin()
 * @pagep:   on success, receives the locked page
 * @fsdata:  unused
 *
 * Returns 0 on success with *pagep set. On failure returns the non-zero
 * result of the NFS_INO_FLUSHING wait, -ENOMEM if no page could be
 * grabbed, or the result of nfs_flush_incompatible(); in all failure
 * cases *pagep is left untouched, so the caller is never handed a page
 * that has already been unlocked and released.
 */
static int nfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	int ret;
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
		file->f_path.dentry->d_parent->d_name.name,
		file->f_path.dentry->d_name.name,
		mapping->host->i_ino, len, (long long) pos);

	/*
	 * Prevent starvation issues if someone is doing a consistency
	 * sync-to-disk
	 */
	ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
			nfs_wait_bit_killable, TASK_KILLABLE);
	if (ret)
		return ret;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	ret = nfs_flush_incompatible(file, page);
	if (ret) {
		/* Drop the lock and our reference; do not publish the page */
		unlock_page(page);
		page_cache_release(page);
		return ret;
	}

	/* Only hand the page back once nothing can fail any more */
	*pagep = page;
	return 0;
}
/*
 * Complete a buffered write started by nfs_write_begin().
 *
 * If the page is not yet up to date, the parts of it that the write
 * did not cover are zeroed, based on nfs_page_length():
 *  - page length 0: zero everything outside [offset, offset + len) and
 *    mark the page up to date;
 *  - write ends at or beyond the page length: zero from the end of the
 *    write to the end of the page, and mark the page up to date only if
 *    the write started at offset 0;
 *  - otherwise: zero from the page length to the end of the page.
 *
 * The page is then passed to nfs_updatepage(), unlocked and released.
 *
 * Returns a negative status from nfs_updatepage(), else @copied.
 */
static int nfs_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	unsigned pg_off = pos & (PAGE_CACHE_SIZE - 1);
	int err;

	dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
		file->f_path.dentry->d_parent->d_name.name,
		file->f_path.dentry->d_name.name,
		mapping->host->i_ino, len, (long long) pos);

	/*
	 * Zero any uninitialised parts of the page, and then mark the page
	 * as up to date if it turns out that we're extending the file.
	 */
	if (!PageUptodate(page)) {
		unsigned valid_len = nfs_page_length(page);
		unsigned wr_end = pg_off + len;

		if (valid_len == 0) {
			zero_user_segments(page, 0, pg_off,
					wr_end, PAGE_CACHE_SIZE);
			SetPageUptodate(page);
		} else if (wr_end < valid_len) {
			zero_user_segment(page, valid_len, PAGE_CACHE_SIZE);
		} else {
			zero_user_segment(page, wr_end, PAGE_CACHE_SIZE);
			if (pg_off == 0)
				SetPageUptodate(page);
		}
	}

	err = nfs_updatepage(file, page, pg_off, copied);

	unlock_page(page);
	page_cache_release(page);

	if (err >= 0)
		return copied;
	return err;
}
/*
 * invalidatepage callback.
 *
 * Only a whole-page invalidation (@offset == 0) does anything: it
 * cancels any unstarted writes on the page. A non-zero offset is
 * logged and otherwise ignored.
 */
static void nfs_invalidate_page(struct page *page, unsigned long offset)
{
	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);

	/* Cancel any unstarted writes on this page */
	if (offset == 0)
		nfs_wb_page_cancel(page->mapping->host, page);
}
/*
 * releasepage callback.
 *
 * Always returns 0: if PagePrivate() is set, then the page is not
 * freeable. @gfp is not consulted.
 */
static int nfs_release_page(struct page *page, gfp_t gfp)
{
	const int not_freeable = 0;

	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);

	return not_freeable;
}
static int nfs_launder_page(struct page *page)
{
struct inode *inode = page->mapping->host;
dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
inode->i_ino, (long long)page_offset(page));
return nfs_wb_page(inode, page);
}
/*
 * Address-space operations for NFS file mappings.
 *
 * write_begin, write_end, invalidatepage, releasepage and launder_page
 * are implemented in this file; the read, writeback and direct-I/O
 * handlers are defined elsewhere. Page dirtying uses
 * __set_page_dirty_nobuffers.
 */
const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
.readpages = nfs_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
.writepage = nfs_writepage,
.writepages = nfs_writepages,
.write_begin = nfs_write_begin,
.write_end = nfs_write_end,
.invalidatepage = nfs_invalidate_page,
.releasepage = nfs_release_page,
.direct_IO = nfs_direct_IO,
.launder_page = nfs_launder_page,
};
/*
 * page_mkwrite callback: called when a page of a shared mapping is
 * about to be written through the mapping.
 *
 * With the page locked:
 *  - if the page no longer belongs to this file's mapping, fail with
 *    -EINVAL;
 *  - if nfs_page_length() reports 0, there is nothing to do;
 *  - otherwise flush any incompatible pending request and register the
 *    first nfs_page_length() bytes of the page as an update via
 *    nfs_updatepage().
 *
 * Returns 0 on success, or the error from the mapping check,
 * nfs_flush_incompatible() or nfs_updatepage(). Success is reported
 * as 0 rather than as the page length.
 * NOTE(review): the fault path is expected to treat only negative
 * values as failure - confirm against the mm callers for this kernel
 * version.
 */
static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	struct file *filp = vma->vm_file;
	struct dentry *dentry = filp->f_path.dentry;
	unsigned pagelen;
	int ret = -EINVAL;
	struct address_space *mapping;

	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		filp->f_mapping->host->i_ino,
		(long long)page_offset(page));

	lock_page(page);

	/* The page may have been detached from the file while unlocked */
	mapping = page->mapping;
	if (mapping != dentry->d_inode->i_mapping)
		goto out_unlock;

	ret = 0;
	pagelen = nfs_page_length(page);
	if (pagelen == 0)
		goto out_unlock;

	ret = nfs_flush_incompatible(filp, page);
	if (ret != 0)
		goto out_unlock;

	/* 0 on success; do not convert success into the page length */
	ret = nfs_updatepage(filp, page, 0, pagelen);
out_unlock:
	unlock_page(page);
	return ret;
}
/*
 * VM operations installed on a vma by nfs_file_mmap(): page faults use
 * filemap_fault, and write faults on existing pages are handled by
 * nfs_vm_page_mkwrite().
 */
static struct vm_operations_struct nfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = nfs_vm_page_mkwrite,
};
/*
 * Decide whether a write on @filp must be followed by a sync.
 *
 * Returns 1 if the inode is IS_SYNC(), the file was opened with O_SYNC,
 * or the open context has NFS_CONTEXT_ERROR_WRITE set; 0 otherwise.
 */
static int nfs_need_sync_write(struct file *filp, struct inode *inode)
{
	struct nfs_open_context *ctx;

	if (IS_SYNC(inode) || (filp->f_flags & O_SYNC))
		return 1;

	ctx = nfs_file_open_context(filp);
	return test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ? 1 : 0;
}
/*
 * aio_write entry point for regular files.
 *
 * O_DIRECT files are handed to nfs_file_direct_write(). Otherwise:
 *  - writing to an active swap file is refused with -EBUSY;
 *  - O_APPEND forces a revalidation of the file length, whose non-zero
 *    result aborts the write;
 *  - a zero-length write returns 0 without touching the statistics;
 *  - the byte count is added to NORMALWRITTENBYTES and the write is
 *    delegated to generic_file_aio_write();
 *  - if that succeeded and nfs_need_sync_write() says so, the data is
 *    synced with nfs_do_fsync() and a negative sync result replaces the
 *    byte count.
 */
static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct dentry *dentry = filp->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	size_t count = iov_length(iov, nr_segs);
	ssize_t result;
	int err;

	if (filp->f_flags & O_DIRECT)
		return nfs_file_direct_write(iocb, iov, nr_segs, pos);

	dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		(unsigned long) count, (long long) pos);

	if (IS_SWAPFILE(inode)) {
		printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
		return -EBUSY;
	}

	/*
	 * O_APPEND implies that we must revalidate the file length.
	 */
	if (filp->f_flags & O_APPEND) {
		result = nfs_revalidate_file_size(inode, filp);
		if (result)
			return result;
	}

	if (count == 0)
		return 0;

	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
	result = generic_file_aio_write(iocb, iov, nr_segs, pos);

	/* Return error values for O_SYNC and IS_SYNC() */
	if (result < 0 || !nfs_need_sync_write(filp, inode))
		return result;

	err = nfs_do_fsync(nfs_file_open_context(filp), inode);
	if (err < 0)
		return err;
	return result;
}
/*
 * Test for a conflicting lock (F_GETLK-style request).
 *
 * Runs under the big kernel lock. Local locks are tested first; if a
 * conflict is found it is reported in @fl and 0 is returned. With no
 * local conflict, the request is answered as "no conflict" (fl_type set
 * to F_UNLCK) when we hold a read delegation or the mount has
 * NFS_MOUNT_NONLM set; otherwise the protocol's lock operation is
 * asked and its status returned.
 */
static int do_getlk(struct file *filp, int cmd, struct file_lock *fl)
{
	struct inode *inode = filp->f_mapping->host;
	int status = 0;

	lock_kernel();

	/* Try local locking first */
	posix_test_lock(filp, fl);
	if (fl->fl_type == F_UNLCK) {
		/* No local conflict: do we need to ask the server at all? */
		if (nfs_have_delegation(inode, FMODE_READ) ||
		    (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM))
			fl->fl_type = F_UNLCK;
		else
			status = NFS_PROTO(inode)->lock(filp, cmd, fl);
	}

	unlock_kernel();
	return status;
}
/*
 * Apply @fl to the local VFS lock tables.
 *
 * Dispatches on the FL_POSIX/FL_FLOCK bits of fl->fl_flags to
 * posix_lock_file_wait() or flock_lock_file_wait(); any other
 * combination is a BUG(). A negative result is logged and returned.
 */
static int do_vfs_lock(struct file *file, struct file_lock *fl)
{
	const unsigned int lock_kind = fl->fl_flags & (FL_POSIX|FL_FLOCK);
	int res = 0;

	if (lock_kind == FL_POSIX)
		res = posix_lock_file_wait(file, fl);
	else if (lock_kind == FL_FLOCK)
		res = flock_lock_file_wait(file, fl);
	else
		BUG();

	if (res < 0)
		dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager"
			" - error %d!\n",
				__func__, res);
	return res;
}
/*
 * Release a lock.
 *
 * Pending writes are flushed first (result ignored), then the unlock is
 * performed under the big kernel lock: locally via do_vfs_lock() when
 * the mount has NFS_MOUNT_NONLM set, otherwise through the protocol's
 * lock operation. Returns the status of whichever was used.
 */
static int do_unlk(struct file *filp, int cmd, struct file_lock *fl)
{
	struct inode *inode = filp->f_mapping->host;
	int status;

	/*
	 * Flush all pending writes before doing anything
	 * with locks..
	 */
	nfs_sync_mapping(filp->f_mapping);

	/* NOTE: special case
	 * 	If we're signalled while cleaning up locks on process exit, we
	 * 	still need to complete the unlock.
	 */
	lock_kernel();
	/* Use local locking if mounted with "-onolock" */
	if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
		status = do_vfs_lock(filp, fl);
	else
		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
	unlock_kernel();

	return status;
}
/*
 * Acquire a lock.
 *
 * Pending writes are flushed first; a non-zero flush status aborts the
 * request. The lock is then taken under the big kernel lock, locally
 * via do_vfs_lock() when the mount has NFS_MOUNT_NONLM set, otherwise
 * through the protocol's lock operation. If that did not fail, the
 * mapping is flushed again (result ignored) and, unless we hold a read
 * delegation, the caches are zapped.
 *
 * Returns the flush error, or the status of the lock operation.
 */
static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
{
	struct inode *inode = filp->f_mapping->host;
	int status;

	/*
	 * Flush all pending writes before doing anything
	 * with locks..
	 */
	status = nfs_sync_mapping(filp->f_mapping);
	if (status != 0)
		return status;

	lock_kernel();
	/* Use local locking if mounted with "-onolock" */
	if (NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)
		status = do_vfs_lock(filp, fl);
	else
		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
	unlock_kernel();

	if (status < 0)
		return status;

	/*
	 * Make sure we clear the cache whenever we try to get the lock.
	 * This makes locking act as a cache coherency point.
	 */
	nfs_sync_mapping(filp->f_mapping);
	if (!nfs_have_delegation(inode, FMODE_READ))
		nfs_zap_caches(inode);

	return status;
}
/*
 * Lock a (portion of) a file
 *
 * Bumps the VFSLOCK statistic, then:
 *  - refuses with -ENOLCK any non-unlock request on an inode for which
 *    __mandatory_lock() is true (no mandatory locks over NFS);
 *  - if the protocol provides lock_check_bounds(), returns its negative
 *    result;
 *  - dispatches to do_getlk() for IS_GETLK() commands, do_unlk() for
 *    F_UNLCK requests, and do_setlk() for everything else.
 */
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
{
	struct inode *inode = filp->f_mapping->host;
	int err;

	dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
			filp->f_path.dentry->d_parent->d_name.name,
			filp->f_path.dentry->d_name.name,
			fl->fl_type, fl->fl_flags,
			(long long)fl->fl_start, (long long)fl->fl_end);

	nfs_inc_stats(inode, NFSIOS_VFSLOCK);

	/* No mandatory locks over NFS */
	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
		return -ENOLCK;

	if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
		err = NFS_PROTO(inode)->lock_check_bounds(fl);
		if (err < 0)
			return err;
	}

	if (IS_GETLK(cmd))
		return do_getlk(filp, cmd, fl);
	if (fl->fl_type == F_UNLCK)
		return do_unlk(filp, cmd, fl);
	return do_setlk(filp, cmd, fl);
}
/*
 * flock() a file
 *
 * Requests without FL_FLOCK set are refused with -ENOLCK. Otherwise the
 * lock is rewritten to cover the range 0..OFFSET_MAX with the struct
 * file as its owner, and handed to do_unlk() (for F_UNLCK) or
 * do_setlk().
 */
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
{
	dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
			filp->f_path.dentry->d_parent->d_name.name,
			filp->f_path.dentry->d_name.name,
			fl->fl_type, fl->fl_flags);

	if ((fl->fl_flags & FL_FLOCK) == 0)
		return -ENOLCK;

	/* We're simulating flock() locks using posix locks on the server */
	fl->fl_owner = (fl_owner_t)filp;
	fl->fl_start = 0;
	fl->fl_end = OFFSET_MAX;

	return (fl->fl_type == F_UNLCK) ? do_unlk(filp, cmd, fl)
					: do_setlk(filp, cmd, fl);
}
/*
 * There is no protocol support for leases, so we have no way to implement
 * them correctly in the face of opens by other clients.
 *
 * The request is logged and always refused with -EINVAL; @fl is not
 * touched.
 */
static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
{
	const int unsupported = -EINVAL;

	dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
			file->f_path.dentry->d_parent->d_name.name,
			file->f_path.dentry->d_name.name, arg);

	return unsupported;
}