fs: introduce write_begin, write_end, and perform_write aops

These are intended to replace prepare_write and commit_write with more flexible alternatives that are also able to avoid the buffered write deadlock problems efficiently (which prepare_write is unable to do). [mark.fasheh@oracle.com: API design contributions, code review and fixes] [akpm@linux-foundation.org: various fixes] [dmonakhov@sw.ru: new aop block_write_begin fix] Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-11-22 20:22:09 +00:00 · 2007-10-16 01:25:01 -07:00 · 2007-10-16 01:25:01 -07:00 · afddba49d1
commit afddba49d1
parent 637aff46f9
11 changed files with 575 additions and 206 deletions
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@ -178,15 +178,18 @@ prototypes:
 locking rules:
 	All except set_page_dirty may block
-			BKL	PageLocked(page)
+			BKL	PageLocked(page)	i_sem
 writepage:		no	yes, unlocks (see below)
 readpage:		no	yes, unlocks
 sync_page:		no	maybe
 writepages:		no
 set_page_dirty		no	no
 readpages:		no
-prepare_write:		no	yes
+prepare_write:		no	yes			yes
-commit_write:		no	yes
+commit_write:		no	yes			yes
 write_begin:		no	locks the page		yes
 write_end:		no	yes, unlocks		yes
 perform_write:		no	n/a			yes
 bmap:			yes
 invalidatepage:		no	yes
 releasepage:		no	yes
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@ -537,6 +537,12 @@ struct address_space_operations {
 			struct list_head *pages, unsigned nr_pages);
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
 	int (*write_end)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
@ -633,6 +639,45 @@ struct address_space_operations {
        operations.  It should avoid returning an error if possible -
        errors should have been handled by prepare_write.
  write_begin: This is intended as a replacement for prepare_write. The
 	key differences are that:
 		- it returns a locked page (in *pagep) rather than being
 		  given a pre locked page;
 		- it must be able to cope with short writes (where the
 		  length passed to write_begin is greater than the number
 		  of bytes copied into the page).
 	Called by the generic buffered write code to ask the filesystem to
 	prepare to write len bytes at the given offset in the file. The
 	address_space should check that the write will be able to complete,
 	by allocating space if necessary and doing any other internal
 	housekeeping.  If the write will update parts of any basic-blocks on
 	storage, then those blocks should be pre-read (if they haven't been
 	read already) so that the updated blocks can be written out properly.
        The filesystem must return the locked pagecache page for the specified
 	offset, in *pagep, for the caller to write into.
 	flags is a field for AOP_FLAG_xxx flags, described in
 	include/linux/fs.h.
        A void * may be returned in fsdata, which then gets passed into
        write_end.
        Returns 0 on success; < 0 on failure (which is the error code), in
 	which case write_end is not called.
  write_end: After a successful write_begin, and data copy, write_end must
        be called. len is the original len passed to write_begin, and copied
        is the amount that was able to be copied (copied == len is always true
 	if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).
        The filesystem must take care of unlocking the page and releasing its
        refcount, and updating i_size.
        Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
        that were able to be copied into pagecache.
  bmap: called by the VFS to map a logical block offset within object to
  	physical block number. This method is used by the FIBMAP
  	ioctl and for working with swap-files.  To be able to swap to
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@ -204,14 +204,13 @@ lo_do_transfer(struct loop_device *lo, int cmd,
 * do_lo_send_aops - helper for writing data to a loop device
 *
 * This is the fast version for backing filesystems which implement the address
- * space operations prepare_write and commit_write.
+ * space operations write_begin and write_end.
 */
 static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
-		int bsize, loff_t pos, struct page *page)
+		int bsize, loff_t pos, struct page *unused)
 {
 	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
 	int len, ret;
@ -223,63 +222,47 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	len = bvec->bv_len;
 	while (len > 0) {
 		sector_t IV;
-		unsigned size;
+		unsigned size, copied;
 		int transfer_result;
 		struct page *page;
 		void *fsdata;
 		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
 		size = PAGE_CACHE_SIZE - offset;
 		if (size > len)
 			size = len;
-		page = grab_cache_page(mapping, index);
+
-		if (unlikely(!page))
+		ret = pagecache_write_begin(file, mapping, pos, size, 0,
 							&page, &fsdata);
 		if (ret)
 			goto fail;
-		ret = aops->prepare_write(file, page, offset,
+
 					  offset + size);
 		if (unlikely(ret)) {
 			if (ret == AOP_TRUNCATED_PAGE) {
 				page_cache_release(page);
 				continue;
 			}
 			goto unlock;
 		}
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
-		if (unlikely(transfer_result)) {
+		copied = size;
 			/*
 			 * The transfer failed, but we still write the data to
 			 * keep prepare/commit calls balanced.
 			 */
 			printk(KERN_ERR "loop: transfer error block %llu\n",
 			       (unsigned long long)index);
 			zero_user_page(page, offset, size, KM_USER0);
 		}
 		flush_dcache_page(page);
 		ret = aops->commit_write(file, page, offset,
 					 offset + size);
 		if (unlikely(ret)) {
 			if (ret == AOP_TRUNCATED_PAGE) {
 				page_cache_release(page);
 				continue;
 			}
 			goto unlock;
 		}
 		if (unlikely(transfer_result))
-			goto unlock;
+			copied = 0;
-		bv_offs += size;
+
-		len -= size;
+		ret = pagecache_write_end(file, mapping, pos, size, copied,
 							page, fsdata);
 		if (ret < 0)
 			goto fail;
 		if (ret < copied)
 			copied = ret;
 		if (unlikely(transfer_result))
 			goto fail;
 		bv_offs += copied;
 		len -= copied;
 		offset = 0;
 		index++;
-		pos += size;
+		pos += copied;
 		unlock_page(page);
 		page_cache_release(page);
 	}
 	ret = 0;
 out:
 	mutex_unlock(&mapping->host->i_mutex);
 	return ret;
 unlock:
 	unlock_page(page);
 	page_cache_release(page);
 fail:
 	ret = -1;
 	goto out;
@ -313,7 +296,7 @@ static int __do_lo_send_write(struct file *file,
 * do_lo_send_direct_write - helper for writing data to a loop device
 *
 * This is the fast, non-transforming version for backing filesystems which do
- * not implement the address space operations prepare_write and commit_write.
+ * not implement the address space operations write_begin and write_end.
 * It uses the write file operation which should be present on all writeable
 * filesystems.
 */
@ -332,7 +315,7 @@ static int do_lo_send_direct_write(struct loop_device *lo,
 * do_lo_send_write - helper for writing data to a loop device
 *
 * This is the slow, transforming version for filesystems which do not
- * implement the address space operations prepare_write and commit_write.  It
+ * implement the address space operations write_begin and write_end.  It
 * uses the write file operation which should be present on all writeable
 * filesystems.
 *
@ -780,7 +763,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 		 */
 		if (!file->f_op->splice_read)
 			goto out_putf;
-		if (aops->prepare_write && aops->commit_write)
+		if (aops->prepare_write || aops->write_begin)
 			lo_flags |= LO_FLAGS_USE_AOPS;
 		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
 			lo_flags |= LO_FLAGS_READ_ONLY;
--- a/fs/buffer.c
+++ b/fs/buffer.c
@ -1770,6 +1770,48 @@ recover:
 	goto done;
 }
 /*
  * Walk the buffers attached to a locked page and deal with every buffer
  * that is still flagged "new" and overlaps the byte range [from, to):
  *  - if the page is not uptodate, zero the overlapping bytes and mark the
  *    buffer uptodate, so uninitialised block contents can never leak;
  *  - in all cases clear the new bit and dirty the buffer so that it gets
  *    written out.
  * Pages without buffers are left untouched.
  */
 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 {
 	struct buffer_head *first, *cur;
 	unsigned int bh_start = 0, bh_end;
 	BUG_ON(!PageLocked(page));
 	if (!page_has_buffers(page))
 		return;
 	first = page_buffers(page);
 	cur = first;
 	do {
 		bh_end = bh_start + cur->b_size;
 		/* only new buffers intersecting [from, to) are of interest */
 		if (buffer_new(cur) && bh_end > from && bh_start < to) {
 			if (!PageUptodate(page)) {
 				unsigned zero_from = max(from, bh_start);
 				unsigned zero_len = min(to, bh_end) - zero_from;
 				zero_user_page(page, zero_from, zero_len, KM_USER0);
 				set_buffer_uptodate(cur);
 			}
 			clear_buffer_new(cur);
 			mark_buffer_dirty(cur);
 		}
 		bh_start = bh_end;
 		cur = cur->b_this_page;
 	} while (cur != first);
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 static int __block_prepare_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to, get_block_t *get_block)
 {
@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (!buffer_uptodate(*wait_bh))
 			err = -EIO;
 	}
-	if (!err) {
+	if (unlikely(err))
-		bh = head;
+		page_zero_new_buffers(page, from, to);
 		do {
 			if (buffer_new(bh))
 				clear_buffer_new(bh);
 		} while ((bh = bh->b_this_page) != head);
 		return 0;
 	}
 	/* Error case: */
 	/*
 	 * Zero out any newly allocated blocks to avoid exposing stale
 	 * data.  If BH_New is set, we know that the block was newly
 	 * allocated in the above loop.
 	 */
 	bh = head;
 	block_start = 0;
 	do {
 		block_end = block_start+blocksize;
 		if (block_end <= from)
 			goto next_bh;
 		if (block_start >= to)
 			break;
 		if (buffer_new(bh)) {
 			clear_buffer_new(bh);
 			zero_user_page(page, block_start, bh->b_size, KM_USER0);
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
 next_bh:
 		block_start = block_end;
 		bh = bh->b_this_page;
 	} while (bh != head);
 	return err;
 }
@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
 		clear_buffer_new(bh);
 	}
 	/*
@ -1923,6 +1936,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 	return 0;
 }
 /*
  * block_write_begin performs the common write_begin work for block based
  * filesystems: it allocates blocks through get_block and brings partially
  * written blocks uptodate via __block_prepare_write().
  *
  * The caller may pass a locked page in *pagep.  In that case the page is
  * used as is and is neither unlocked nor released on failure.  If *pagep
  * is NULL, a page is obtained here; on failure that page is unlocked,
  * released, and *pagep is reset to NULL.
  *
  * Returns 0 on success or a negative error code.
  */
 int block_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata,
 			get_block_t *get_block)
 {
 	struct inode *inode = mapping->host;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned end = start + len;
 	struct page *page = *pagep;
 	int ownpage = (page == NULL);
 	int status;
 	if (ownpage) {
 		page = __grab_cache_page(mapping, index);
 		if (!page)
 			return -ENOMEM;
 		*pagep = page;
 	} else {
 		BUG_ON(!PageLocked(page));
 	}
 	status = __block_prepare_write(inode, page, start, end, get_block);
 	if (likely(!status))
 		return status;
 	ClearPageUptodate(page);
 	/* a caller supplied page stays locked and referenced */
 	if (!ownpage)
 		return status;
 	unlock_page(page);
 	page_cache_release(page);
 	*pagep = NULL;
 	/*
 	 * prepare_write() may have instantiated a few blocks outside
 	 * i_size.  Trim these off again.  Reading i_size directly is fine:
 	 * i_mutex is held, so i_size_read is not needed.
 	 */
 	if (pos + len > inode->i_size)
 		vmtruncate(inode, inode->i_size);
 	return status;
 }
 EXPORT_SYMBOL(block_write_begin);
 /*
  * Commit the bytes that were copied into @page after block_write_begin().
  * @len is the length originally requested, @copied the number of bytes the
  * caller managed to copy.  Returns the number of bytes actually committed,
  * which is 0 for a short copy into a page that is not uptodate.
  * The page is neither unlocked nor released here.
  */
 int block_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
 	struct inode *host = mapping->host;
 	unsigned pg_off = pos & (PAGE_CACHE_SIZE - 1);
 	if (unlikely(copied < len)) {
 		/*
 		 * Short copy.  Buffers that were written completely are
 		 * uptodate now, so a later readpage cannot clobber them.
 		 * A buffer that was written only partially is not marked
 		 * uptodate though, and a readpage could overwrite the
 		 * partial data.
 		 *
 		 * Keep it simple: a short write into a page that is not
 		 * uptodate counts as a zero-length write, and the caller
 		 * has to redo the whole thing.
 		 */
 		if (!PageUptodate(page))
 			copied = 0;
 		page_zero_new_buffers(page, pg_off + copied, pg_off + len);
 	}
 	flush_dcache_page(page);
 	/* the committed range may be short, or even empty */
 	__block_commit_write(host, page, pg_off, pg_off + copied);
 	return copied;
 }
 EXPORT_SYMBOL(block_write_end);
 /*
  * Generic write_end: commit the copied bytes through block_write_end(),
  * extend i_size if the write went past the current end of file, then
  * unlock the page and drop its reference.
  * Returns the number of bytes committed.
  */
 int generic_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
 	struct inode *host = mapping->host;
 	loff_t new_end;
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 	new_end = pos + copied;
 	/*
 	 * i_size is read directly rather than through i_size_read():
 	 * i_mutex is held, so it cannot change under us.
 	 *
 	 * The update has to happen before the page lock is dropped,
 	 * otherwise page writeout could come in and zero beyond i_size.
 	 */
 	if (new_end > host->i_size) {
 		i_size_write(host, new_end);
 		mark_inode_dirty(host);
 	}
 	unlock_page(page);
 	page_cache_release(page);
 	return copied;
 }
 EXPORT_SYMBOL(generic_write_end);
 /*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
--- a/fs/libfs.c
+++ b/fs/libfs.c
@ -351,6 +351,26 @@ int simple_prepare_write(struct file *file, struct page *page,
 	return 0;
 }
 /*
  * write_begin for the simple filesystems: obtain the pagecache page that
  * covers @pos, hand it back through @pagep and let simple_prepare_write()
  * prepare the range [from, from+len) inside it.
  * Returns -ENOMEM when no page could be obtained, otherwise the result of
  * simple_prepare_write().
  */
 int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
 	pgoff_t pg_index = pos >> PAGE_CACHE_SHIFT;
 	unsigned pg_off = pos & (PAGE_CACHE_SIZE - 1);
 	struct page *pg = __grab_cache_page(mapping, pg_index);
 	if (!pg)
 		return -ENOMEM;
 	*pagep = pg;
 	return simple_prepare_write(file, pg, pg_off, pg_off + len);
 }
 int simple_commit_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
@ -369,6 +389,28 @@ int simple_commit_write(struct file *file, struct page *page,
 	return 0;
 }
 /*
  * write_end for the simple filesystems: clear whatever part of the
  * requested range was not filled by the copy, commit the copied bytes via
  * simple_commit_write(), then unlock the page and drop its reference.
  * Returns @copied.
  */
 int simple_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
 	unsigned pg_off = pos & (PAGE_CACHE_SIZE - 1);
 	/* a short copy leaves the tail of the range stale: zero it */
 	if (copied < len) {
 		void *map = kmap_atomic(page, KM_USER0);
 		unsigned stale = len - copied;
 		memset(map + pg_off + copied, 0, stale);
 		flush_dcache_page(page);
 		kunmap_atomic(map, KM_USER0);
 	}
 	simple_commit_write(file, page, pg_off, pg_off + copied);
 	unlock_page(page);
 	page_cache_release(page);
 	return copied;
 }
 /*
 * the inodes created here are not hashed. If you use iunique to generate
 * unique inode values later for this filesystem, then you must take care
@ -642,6 +684,8 @@ EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
 EXPORT_SYMBOL(get_sb_pseudo);
 EXPORT_SYMBOL(simple_write_begin);
 EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_commit_write);
 EXPORT_SYMBOL(simple_dir_inode_operations);
 EXPORT_SYMBOL(simple_dir_operations);
--- a/fs/namei.c
+++ b/fs/namei.c
@ -2729,53 +2729,29 @@ int __page_symlink(struct inode *inode, const char *symname, int len,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	void *fsdata;
 	int err;
 	char *kaddr;
 retry:
-	err = -ENOMEM;
+	err = pagecache_write_begin(NULL, mapping, 0, len-1,
-	page = find_or_create_page(mapping, 0, gfp_mask);
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (!page)
 		goto fail;
 	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
 	if (err == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto retry;
 	}
 	if (err)
-		goto fail_map;
+		goto fail;
 	kaddr = kmap_atomic(page, KM_USER0);
 	memcpy(kaddr, symname, len-1);
 	kunmap_atomic(kaddr, KM_USER0);
-	err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
+
-	if (err == AOP_TRUNCATED_PAGE) {
+	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
-		page_cache_release(page);
+							page, fsdata);
 		goto retry;
 	}
 	if (err)
 		goto fail_map;
 	/*
 	 * Notice that we are _not_ going to block here - end of page is
 	 * unmapped, so this will only try to map the rest of page, see
 	 * that it is unmapped (typically even will not look into inode -
 	 * ->i_size will be enough for everything) and zero it out.
 	 * OTOH it's obviously correct and should make the page up-to-date.
 	 */
 	if (!PageUptodate(page)) {
 		err = mapping->a_ops->readpage(NULL, page);
 		if (err != AOP_TRUNCATED_PAGE)
 			wait_on_page_locked(page);
 	} else {
 		unlock_page(page);
 	}
 	page_cache_release(page);
 	if (err < 0)
 		goto fail;
 	if (err < len-1)
 		goto retry;
 	mark_inode_dirty(inode);
 	return 0;
 fail_map:
 	unlock_page(page);
 	page_cache_release(page);
 fail:
 	return err;
 }
--- a/fs/splice.c
+++ b/fs/splice.c
@ -563,7 +563,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct address_space *mapping = file->f_mapping;
 	unsigned int offset, this_len;
 	struct page *page;
-	pgoff_t index;
+	void *fsdata;
 	int ret;
 	/*
@ -573,49 +573,16 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	if (unlikely(ret))
 		return ret;
 	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 	this_len = sd->len;
 	if (this_len + offset > PAGE_CACHE_SIZE)
 		this_len = PAGE_CACHE_SIZE - offset;
-find_page:
+	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
-	page = find_lock_page(mapping, index);
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (!page) {
 		ret = -ENOMEM;
 		page = page_cache_alloc_cold(mapping);
 		if (unlikely(!page))
 			goto out_ret;
 		/*
 		 * This will also lock the page
 		 */
 		ret = add_to_page_cache_lru(page, mapping, index,
 					    GFP_KERNEL);
 	if (unlikely(ret))
-			goto out_release;
+		goto out;
 	}
 	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
 	if (unlikely(ret)) {
 		loff_t isize = i_size_read(mapping->host);
 		if (ret != AOP_TRUNCATED_PAGE)
 			unlock_page(page);
 		page_cache_release(page);
 		if (ret == AOP_TRUNCATED_PAGE)
 			goto find_page;
 		/*
 		 * prepare_write() may have instantiated a few blocks
 		 * outside i_size.  Trim these off again.
 		 */
 		if (sd->pos + this_len > isize)
 			vmtruncate(mapping->host, isize);
 		goto out_ret;
 	}
 	if (buf->page != page) {
 		/*
@ -629,31 +596,9 @@ find_page:
 		kunmap_atomic(dst, KM_USER1);
 		buf->ops->unmap(pipe, buf, src);
 	}
-
+	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
-	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
+				page, fsdata);
 	if (ret) {
 		if (ret == AOP_TRUNCATED_PAGE) {
 			page_cache_release(page);
 			goto find_page;
 		}
 		if (ret < 0)
 			goto out;
 		/*
 		 * Partial write has happened, so 'ret' already initialized by
 		 * number of bytes written, Where is nothing we have to do here.
 		 */
 	} else
 		ret = this_len;
 	/*
 	 * Return the number of bytes written and mark page as
 	 * accessed, we are now done!
 	 */
 	mark_page_accessed(page);
 out:
 	unlock_page(page);
 out_release:
 	page_cache_release(page);
 out_ret:
 	return ret;
 }
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@ -203,6 +203,16 @@ void block_invalidatepage(struct page *page, unsigned long offset);
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
 int block_write_begin(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page **, void **, get_block_t*);
 int block_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page *, void *);
 int generic_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page *, void *);
 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 				loff_t *);
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@ -394,6 +394,8 @@ enum positive_aop_returns {
 	AOP_TRUNCATED_PAGE	= 0x80001,
 };
 #define AOP_FLAG_UNINTERRUPTIBLE	0x0001 /* will not do a short write */
 /*
 * oh the beauties of C type declarations.
 */
@ -413,7 +415,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 size_t iov_iter_copy_from_user(struct page *page,
 		struct iov_iter *i, unsigned long offset, size_t bytes);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i);
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(struct iov_iter *i);
 static inline void iov_iter_init(struct iov_iter *i,
@ -454,6 +456,14 @@ struct address_space_operations {
 	 */
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
 	int (*write_end)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
@ -468,6 +478,18 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 };
 /*
 * pagecache_write_begin/pagecache_write_end must be used by general code
 * to write into the pagecache.
 */
 int pagecache_write_begin(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
 int pagecache_write_end(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
 struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
@ -1866,6 +1888,12 @@ extern int simple_prepare_write(struct file *file, struct page *page,
 			unsigned offset, unsigned to);
 extern int simple_commit_write(struct file *file, struct page *page,
 				unsigned offset, unsigned to);
 extern int simple_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata);
 extern int simple_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata);
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@ -96,6 +96,8 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
 struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
 /*
 * Returns locked page at given index in given cache, creating it if needed.
 */
--- a/mm/filemap.c
+++ b/mm/filemap.c
@ -1742,11 +1742,20 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 	i->count -= bytes;
 }
-int iov_iter_fault_in_readable(struct iov_iter *i)
+/*
 * Fault in the first iovec of the given iov_iter, to a maximum length
 * of bytes. Returns 0 on success, or non-zero if the memory could not be
 * accessed (ie. because it is an invalid address).
 *
 * writev-intensive code may want this to prefault several iovecs -- that
 * would be possible (callers must not rely on the fact that _only_ the
 * first iovec will be faulted with the current implementation).
 */
 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 {
 	size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count);
 	char __user *buf = i->iov->iov_base + i->iov_offset;
-	return fault_in_pages_readable(buf, seglen);
+	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
 	return fault_in_pages_readable(buf, bytes);
 }
 /*
@ -1843,6 +1852,95 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 }
 EXPORT_SYMBOL(generic_write_checks);
 /*
  * Prepare to write @len bytes at @pos into the pagecache of @mapping.
  *
  * If the address_space provides ->write_begin, the call is forwarded to it
  * unchanged.  Otherwise write_begin is emulated on top of the legacy
  * ->prepare_write operation: a page is obtained with __grab_cache_page(),
  * stored in *pagep and prepared for the range [offset, offset+len).
  *
  * Returns the result of ->write_begin in the first case.  In the emulated
  * case returns 0 on success, -ENOMEM if no page could be obtained, or the
  * error from ->readpage / ->prepare_write.  @fsdata is not touched by the
  * emulation.
  */
 int pagecache_write_begin(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 	if (aops->write_begin) {
 		return aops->write_begin(file, mapping, pos, len, flags,
 							pagep, fsdata);
 	} else {
 		int ret;
 		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
 		struct inode *inode = mapping->host;
 		struct page *page;
 again:
 		/* every retry starts over with a freshly grabbed page */
 		page = __grab_cache_page(mapping, index);
 		*pagep = page;
 		if (!page)
 			return -ENOMEM;
 		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
 			/*
 			 * There is no way to resolve a short write situation
 			 * for a !Uptodate page (except by double copying in
 			 * the caller done by generic_perform_write_2copy).
 			 *
 			 * Instead, we have to bring it uptodate here.
 			 */
 			ret = aops->readpage(file, page);
 			/* drop our reference; the page is looked up again */
 			page_cache_release(page);
 			if (ret) {
 				if (ret == AOP_TRUNCATED_PAGE)
 					goto again;
 				return ret;
 			}
 			goto again;
 		}
 		ret = aops->prepare_write(file, page, offset, offset+len);
 		if (ret) {
 			/* no unlock here when AOP_TRUNCATED_PAGE is returned */
 			if (ret != AOP_TRUNCATED_PAGE)
 				unlock_page(page);
 			page_cache_release(page);
 			/* undo anything set up beyond the current i_size */
 			if (pos + len > inode->i_size)
 				vmtruncate(inode, inode->i_size);
 			if (ret == AOP_TRUNCATED_PAGE)
 				goto again;
 		}
 		return ret;
 	}
 }
 EXPORT_SYMBOL(pagecache_write_begin);
 /*
  * Finish a pagecache write started with pagecache_write_begin().
  *
  * With a native ->write_end the page is marked accessed and the call is
  * forwarded.  Otherwise the legacy ->commit_write is used on the range
  * [offset, offset+len), after which the page is unlocked, marked accessed
  * and released here.
  *
  * Returns a negative error, or the number of bytes accepted (never more
  * than @copied in the legacy case).
  */
 int pagecache_write_end(struct file *file, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 	struct inode *host = mapping->host;
 	unsigned pg_off = pos & (PAGE_CACHE_SIZE - 1);
 	int rc;
 	if (aops->write_end) {
 		mark_page_accessed(page);
 		return aops->write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	}
 	flush_dcache_page(page);
 	rc = aops->commit_write(file, page, pg_off, pg_off + len);
 	unlock_page(page);
 	mark_page_accessed(page);
 	page_cache_release(page);
 	BUG_ON(rc == AOP_TRUNCATED_PAGE); /* can't deal with */
 	if (rc < 0) {
 		/* trim anything that ended up beyond the current i_size */
 		if (pos + len > host->i_size)
 			vmtruncate(host, host->i_size);
 		return rc;
 	}
 	if (rc > 0)
 		return min_t(size_t, copied, rc);
 	return copied;
 }
 EXPORT_SYMBOL(pagecache_write_end);
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@ -1886,8 +1984,7 @@ EXPORT_SYMBOL(generic_file_direct_write);
 * Find or create a page at the given pagecache position. Return the locked
 * page. This function is specifically for buffered writes.
 */
-static struct page *__grab_cache_page(struct address_space *mapping,
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
 							pgoff_t index)
 {
 	int status;
 	struct page *page;
@ -1908,20 +2005,16 @@ repeat:
 	}
 	return page;
 }
 EXPORT_SYMBOL(__grab_cache_page);
-ssize_t
+static ssize_t generic_perform_write_2copy(struct file *file,
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+				struct iov_iter *i, loff_t pos)
 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
 		size_t count, ssize_t written)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode *inode = mapping->host;
 	long status = 0;
-	struct iov_iter i;
+	ssize_t written = 0;
 	iov_iter_init(&i, iov, nr_segs, count, written);
 	do {
 		struct page *src_page;
@ -1934,7 +2027,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(&i));
+						iov_iter_count(i));
 		/*
 		 * a non-NULL src_page indicates that we're doing the
@ -1952,7 +2045,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
-		if (unlikely(iov_iter_fault_in_readable(&i))) {
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
 			status = -EFAULT;
 			break;
 		}
@ -1983,7 +2076,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * same reason as we can't take a page fault with a
 			 * page locked (as explained below).
 			 */
-			copied = iov_iter_copy_from_user(src_page, &i,
+			copied = iov_iter_copy_from_user(src_page, i,
 								offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
@ -2008,7 +2101,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 				page_cache_release(src_page);
 				continue;
 			}
 		}
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
@ -2030,7 +2122,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * really matter.
 			 */
 			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, &i,
+			copied = iov_iter_copy_from_user_atomic(page, i,
 								offset, bytes);
 			pagefault_enable();
 		} else {
@ -2056,9 +2148,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		if (src_page)
 			page_cache_release(src_page);
-		iov_iter_advance(&i, copied);
+		iov_iter_advance(i, copied);
 		written += copied;
 		pos += copied;
 		written += copied;
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
@ -2082,13 +2174,117 @@ fs_write_aop_error:
 			continue;
 		else
 			break;
-	} while (iov_iter_count(&i));
+	} while (iov_iter_count(i));
-	*ppos = pos;
+
 	return written ? written : status;
 }
 /*
  * Buffered write loop built on the ->write_begin / ->write_end operations.
  *
  * Copies the data described by the iov_iter @i into the pagecache of
  * @file's mapping, starting at file position @pos, at most one pagecache
  * page per iteration.
  *
  * Returns the number of bytes written if any were written, otherwise the
  * last status (0 or a negative error code).
  */
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
 {
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	long status = 0;
 	ssize_t written = 0;
 	do {
 		struct page *page;
 		pgoff_t index;		/* Pagecache index for current page */
 		unsigned long offset;	/* Offset into pagecache page */
 		unsigned long bytes;	/* Bytes to write to page */
 		size_t copied;		/* Bytes copied from user */
 		void *fsdata;
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
 		/* never cross a page boundary, never exceed what is left */
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
 						iov_iter_count(i));
 again:
 		/*
 		 * Bring in the user page that we will copy from _first_.
 		 * Otherwise there's a nasty deadlock on copying from the
 		 * same page as we're writing to, without it being marked
 		 * up-to-date.
 		 *
 		 * Not only is this an optimisation, but it is also required
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
 		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
 			status = -EFAULT;
 			break;
 		}
 		status = a_ops->write_begin(file, mapping, pos, bytes, 0,
 						&page, &fsdata);
 		if (unlikely(status))
 			break;
 		/* the copy runs with pagefaults disabled and may be short */
 		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
 		pagefault_enable();
 		flush_dcache_page(page);
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
 						page, fsdata);
 		if (unlikely(status < 0))
 			break;
 		/* write_end reports how many bytes it actually accepted */
 		copied = status;
 		cond_resched();
 		if (unlikely(copied == 0)) {
 			/*
 			 * If we were unable to copy any data at all, we must
 			 * fall back to a single segment length write.
 			 *
 			 * If we didn't fallback here, we could livelock
 			 * because not all segments in the iov can be copied at
 			 * once without a pagefault.
 			 */
 			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
 						iov_iter_single_seg_count(i));
 			goto again;
 		}
 		iov_iter_advance(i, copied);
 		pos += copied;
 		written += copied;
 		balance_dirty_pages_ratelimited(mapping);
 	} while (iov_iter_count(i));
 	return written ? written : status;
 }
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
 		size_t count, ssize_t written)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode *inode = mapping->host;
 	ssize_t status;
 	struct iov_iter i;
 	iov_iter_init(&i, iov, nr_segs, count, written);
 	if (a_ops->write_begin)
 		status = generic_perform_write(file, &i, pos);
 	else
 		status = generic_perform_write_2copy(file, &i, pos);
 	if (likely(status >= 0)) {
 		written += status;
 		*ppos = pos + status;
 		/*
 		 * For now, when the user asks for O_SYNC, we'll actually give
 		 * O_DSYNC
 		 */
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
 				status = generic_osync_inode(inode, mapping,