xfs: update for 4.8-rc1

Changes in this update: o generic iomap based IO path infrastructure o generic iomap based fiemap implementation o xfs iomap based Io path implementation o buffer error handling fixes o tracking of in flight buffer IO for unmount serialisation o direct IO and DAX io path separation and simplification o shortform directory format definition changes for wider platform compatibility o various buffer cache fixes o cleanups in preparation for rmap merge o error injection cleanups and fixes o log item format buffer memory allocation restructuring to prevent rare OOM reclaim deadlocks o sparse inode chunks are now fully supported. -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAABAgAGBQJXmA5XAAoJEK3oKUf0dfodCc0QAKY5Jlfw5HwLria+Ad87HCcM Zi/LGMMC3CPh+vkbqsmDnLKHYjXRwi3HamBoXdufiE8E3UtOjp/sV98/fCw+zwhe tHDLmdAx23RLTn7gUhcsIXydKeXh0+HlRxPa4eBAlmnsJ3nGgrKrKQLgDT7Gjlum nPfRSTYjzm5gs2dpUTYhMV7MplenDW9GFz2uBMct6N9kYQ9m225I99fd/4nb/L7R o/8UocsK7iREUXP6decDoN9uIAzE2mYR720EL+Txy09CTYy+luNyGoNXOsQtxT5O plyoPZbzIIDvC44bvp6bZX96Udm7tAeTloieInCZG13I2zJy9gmTmLqkZ3M2at12 kOyeAMSBOWQYSa3uh++FsEP+JGtBTlZXf+4DAYf+U08s8tMVE/61/RZrtJZF4OjW hyumRBD6zqZ9Y6Qtji2HaA3l9IGxOC2k4URw9JZdDDyMoRTQvawN1QWNAeZINXiv 9ywqTruVsfQnoGDC1Gk1OEfQpubNztTAkEPqVM7ez5dkwOdwuOZXcZPL1Ltvb4Bt PLaWKLIYFYZKrM5kqgQlTERspSQA99++z8H9a21wFezfetaBby28fIqwMMfQAiSw nCq95WshJPwenogMtWjNfOgs/fqOBKdPdLFw0H6Jpmjwna2KpuFIZiTnwu25vvjz dHh4DVSuMTq1pBkXEU7B =vcSd -----END PGP SIGNATURE----- Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs Pull xfs updates from Dave Chinner: "The major addition is the new iomap based block mapping infrastructure. We've been kicking this about locally for years, but there are other filesystems want to use it too (e.g. gfs2). Now it is fully working, reviewed and ready for merge and be used by other filesystems. 
There are a lot of other fixes and cleanups in the tree, but those are XFS internal things and none are of the scale or visibility of the iomap changes. See below for details. I am likely to send another pull request next week - we're just about ready to merge some new functionality (on disk block->owner reverse mapping infrastructure), but that's a huge chunk of code (74 files changed, 7283 insertions(+), 1114 deletions(-)) so I'm keeping that separate to all the "normal" pull request changes so they don't get lost in the noise. Summary of changes in this update: - generic iomap based IO path infrastructure - generic iomap based fiemap implementation - xfs iomap based IO path implementation - buffer error handling fixes - tracking of in flight buffer IO for unmount serialisation - direct IO and DAX io path separation and simplification - shortform directory format definition changes for wider platform compatibility - various buffer cache fixes - cleanups in preparation for rmap merge - error injection cleanups and fixes - log item format buffer memory allocation restructuring to prevent rare OOM reclaim deadlocks - sparse inode chunks are now fully supported" * tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (53 commits) xfs: remove EXPERIMENTAL tag from sparse inode feature xfs: bufferhead chains are invalid after end_page_writeback xfs: allocate log vector buffers outside CIL context lock libxfs: directory node splitting does not have an extra block xfs: remove dax code from object file when disabled xfs: skip dirty pages in ->releasepage() xfs: remove __arch_pack xfs: kill xfs_dir2_inou_t xfs: kill xfs_dir2_sf_off_t xfs: split direct I/O and DAX path xfs: direct calls in the direct I/O path xfs: stop using generic_file_read_iter for direct I/O xfs: split xfs_file_read_iter into buffered and direct I/O helpers xfs: remove s_maxbytes enforcement in xfs_file_read_iter xfs: kill ioflags xfs: don't pass ioflags around in 
the ioctl path xfs: track and serialize in-flight async buffers against unmount xfs: exclude never-released buffers from buftarg I/O accounting xfs: don't reset b_retries to 0 on every failure xfs: remove extraneous buffer flag changes ...
2016-07-27 09:53:35 -07:00 · 2016-07-27 09:53:35 -07:00 · 0e6acf0204
commit 0e6acf0204
parent 0e06f5c0de f2bdfda9a1
66 changed files with 2038 additions and 1452 deletions
--- a/fs/Kconfig
+++ b/fs/Kconfig
@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS

 if BLOCK

+config FS_IOMAP
+	bool
+
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
 source "fs/jbd2/Kconfig"
--- a/fs/Makefile
+++ b/fs/Makefile
@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o

 obj-$(CONFIG_FHANDLE)		+= fhandle.o
+obj-$(CONFIG_FS_IOMAP)		+= iomap.o

 obj-y				+= quota/

--- a/fs/buffer.c
+++ b/fs/buffer.c
@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
@ -1892,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);

-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block)
+/*
+ * Set up the state of @bh for file block @block of @inode from the extent
+ * described by @iomap.  Used by __block_write_begin_int() in place of a
+ * get_block callback when the caller supplies an iomap.
+ *
+ * The buffer's block device is always taken from the iomap; the remaining
+ * buffer flags (new/mapped/delay/unwritten/uptodate) and b_blocknr depend
+ * on the iomap type.
+ */
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+		struct iomap *iomap)
+{
+	/*
+	 * Widen to loff_t before shifting so the byte offset cannot be
+	 * truncated if sector_t is narrower than loff_t.
+	 */
+	loff_t offset = (loff_t)block << inode->i_blkbits;
+
+	bh->b_bdev = iomap->bdev;
+
+	/*
+	 * Block points to offset in file we need to map, iomap contains
+	 * the offset at which the map starts. The caller must only ask for
+	 * blocks that lie inside the map; a block at or beyond the end of
+	 * the map is treated as a bug.
+	 */
+	BUG_ON(offset >= iomap->offset + iomap->length);
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		/*
+		 * If the buffer is not up to date or beyond the current EOF,
+		 * we need to mark it as new to ensure sub-block zeroing is
+		 * executed if necessary.
+		 */
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		break;
+	case IOMAP_DELALLOC:
+		/* Same "new" rule as for holes, then flag as delalloc. */
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		set_buffer_uptodate(bh);
+		set_buffer_mapped(bh);
+		set_buffer_delay(bh);
+		break;
+	case IOMAP_UNWRITTEN:
+		/*
+		 * For unwritten regions, we always need to ensure that
+		 * sub-block writes cause the regions in the block we are not
+		 * writing to are zeroed. Set the buffer as new to ensure this.
+		 */
+		set_buffer_new(bh);
+		set_buffer_unwritten(bh);
+		/* FALLTHRU */
+	case IOMAP_MAPPED:
+		if (offset >= i_size_read(inode))
+			set_buffer_new(bh);
+		/*
+		 * iomap->blkno is shifted down by (i_blkbits - 9) to get a
+		 * filesystem block number, then the block's distance from
+		 * the start of the map is added.
+		 */
+		bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+				((offset - iomap->offset) >> inode->i_blkbits);
+		set_buffer_mapped(bh);
+		break;
+	}
+}
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block, struct iomap *iomap)
 {
 	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + len;
@ -1929,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {
 			WARN_ON(bh->b_size != blocksize);
-			err = get_block(inode, block, bh, 1);
-			if (err)
-				break;
+			if (get_block) {
+				err = get_block(inode, block, bh, 1);
+				if (err)
+					break;
+			} else {
+				iomap_to_bh(inode, block, bh, iomap);
+			}
+
 			if (buffer_new(bh)) {
 				unmap_underlying_metadata(bh->b_bdev,
 							bh->b_blocknr);
@ -1972,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		page_zero_new_buffers(page, from, to);
 	return err;
 }
+
+/*
+ * get_block based entry point: forwards to __block_write_begin_int() with
+ * the supplied @get_block callback and no iomap.
+ */
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
+{
+	return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
 EXPORT_SYMBOL(__block_write_begin);

 static int __block_commit_write(struct inode *inode, struct page *page,
--- a/fs/internal.h
+++ b/fs/internal.h
@ -11,6 +11,7 @@

 struct super_block;
 struct file_system_type;
+struct iomap;
 struct linux_binprm;
 struct path;
 struct mount;
@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
 * buffer.c
 */
 extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block, struct iomap *iomap);

 /*
 * char_dev.c
--- a/fs/iomap.c
+++ b/fs/iomap.c
@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (c) 2016 Christoph Hellwig.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/dax.h>
+#include "internal.h"
+
+typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
+		void *data, struct iomap *iomap);
+
+/*
+ * Execute a iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ *
+ * Returns the actor's result when it is non-zero (bytes processed or a
+ * negative error), otherwise the result of ->iomap_end, or 0 if the
+ * filesystem does not provide ->iomap_end.  Errors from ->iomap_begin are
+ * returned directly; -EIO is returned if the mapping handed back does not
+ * start at or before @pos or is empty.
+ */
+static loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+		struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+	struct iomap iomap = { 0 };
+	loff_t written = 0, ret;
+
+	/*
+	 * Need to map a range from start position for length bytes. This can
+	 * span multiple pages - it is only guaranteed to return a range of a
+	 * single type of pages (e.g. all into a hole, all mapped or all
+	 * unwritten). Failure at this point has nothing to undo.
+	 *
+	 * If allocation is required for this range, reserve the space now so
+	 * that the allocation is guaranteed to succeed later on. Once we copy
+	 * the data into the page cache pages, then we cannot fail otherwise we
+	 * expose transient stale data. If the reserve fails, we can safely
+	 * back out at this point as there is nothing to undo.
+	 */
+	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+	if (ret)
+		return ret;
+	if (WARN_ON(iomap.offset > pos))
+		return -EIO;
+	/*
+	 * An empty mapping would hand the actor a non-positive length, so
+	 * reject it here instead.
+	 */
+	if (WARN_ON(iomap.length == 0))
+		return -EIO;
+
+	/*
+	 * Cut down the length to the one actually provided by the filesystem,
+	 * as it might not be able to give us the whole size that we requested.
+	 */
+	if (iomap.offset + iomap.length < pos + length)
+		length = iomap.offset + iomap.length - pos;
+
+	/*
+	 * Now that we have guaranteed that the space allocation will succeed,
+	 * we can do the copy-in page by page without having to worry about
+	 * failures exposing transient data.
+	 */
+	written = actor(inode, pos, length, data, &iomap);
+
+	/*
+	 * Now the data has been copied, commit the range we've copied.  This
+	 * should not fail unless the filesystem has had a fatal error.
+	 * ->iomap_end is treated as optional; ret is still 0 from the
+	 * successful ->iomap_begin when it is not provided.
+	 */
+	if (ops->iomap_end) {
+		ret = ops->iomap_end(inode, pos, length,
+				written > 0 ? written : 0,
+				flags, &iomap);
+	}
+
+	return written ? written : ret;
+}
+
+/*
+ * Clean up after a failed or short write of @len bytes at @pos: drop any
+ * page cache that now sits beyond the current inode size.
+ */
+static void
+iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
+{
+	loff_t isize = i_size_read(inode);
+	loff_t end = pos + len;
+
+	/* Nothing extends past EOF, so there is nothing to remove. */
+	if (end <= isize)
+		return;
+
+	/*
+	 * Only truncate newly allocated pages beyond EOF, even if the
+	 * write started inside the existing inode size.
+	 */
+	truncate_pagecache_range(inode, max(pos, isize), end);
+}
+
+/*
+ * Obtain the page cache page covering @pos and prepare its buffers for a
+ * write of @len bytes using the block mapping in @iomap.
+ *
+ * The range [@pos, @pos + @len) must lie inside @iomap; anything else is
+ * treated as a bug.
+ *
+ * Returns 0 and stores the page in *@pagep on success.  Returns -ENOMEM if
+ * no page could be obtained (*@pagep is not written in that case).  If
+ * preparing the buffers fails, the page is unlocked and released, *@pagep is
+ * set to NULL, iomap_write_failed() is run for the range and the error is
+ * returned.
+ */
+static int
+iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, struct iomap *iomap)
+{
+	pgoff_t index = pos >> PAGE_SHIFT;
+	struct page *page;
+	int status = 0;
+
+	BUG_ON(pos + len > iomap->offset + iomap->length);
+
+	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	/* NULL get_block: buffers are mapped from the iomap instead. */
+	status = __block_write_begin_int(page, pos, len, NULL, iomap);
+	if (unlikely(status)) {
+		unlock_page(page);
+		put_page(page);
+		page = NULL;
+
+		iomap_write_failed(inode, pos, len);
+	}
+
+	*pagep = page;
+	return status;
+}
+
+/*
+ * Finish a write of @copied bytes (out of @len requested) at @pos into
+ * @page by handing it to generic_write_end(), passing NULL for both the
+ * file and the fsdata arguments.
+ *
+ * Returns whatever generic_write_end() returned.  If that is less than
+ * @len, iomap_write_failed() is called for the full requested range.
+ */
+static int
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+		unsigned copied, struct page *page)
+{
+	int ret;
+
+	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
+			copied, page, NULL);
+	/*
+	 * NOTE(review): ret is int and len is unsigned, so this comparison is
+	 * performed as unsigned; a negative ret would not be seen as short.
+	 */
+	if (ret < len)
+		iomap_write_failed(inode, pos, len);
+	return ret;
+}
+
+/*
+ * iomap_apply() actor for buffered writes (see iomap_file_buffered_write()).
+ *
+ * @data is the struct iov_iter to copy from.  Data is copied into the page
+ * cache one page at a time, starting at @pos and covering at most @length
+ * bytes, all within the single mapping described by @iomap.
+ *
+ * Returns the number of bytes written if any were, otherwise the last
+ * status (0 or a negative error).
+ */
+static loff_t
+iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *i = data;
+	long status = 0;
+	ssize_t written = 0;
+	unsigned int flags = AOP_FLAG_NOFS;
+
+	/*
+	 * Copies from kernel address space cannot fail (NFSD is a big user).
+	 */
+	if (!iter_is_iovec(i))
+		flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	do {
+		struct page *page;
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+
+		offset = (pos & (PAGE_SIZE - 1));
+		bytes = min_t(unsigned long, PAGE_SIZE - offset,
+						iov_iter_count(i));
+again:
+		/* Never go past the end of the range we were handed. */
+		if (bytes > length)
+			bytes = length;
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+			status = -EFAULT;
+			break;
+		}
+
+		status = iomap_write_begin(inode, pos, bytes, flags, &page,
+				iomap);
+		if (unlikely(status))
+			break;
+
+		if (mapping_writably_mapped(inode->i_mapping))
+			flush_dcache_page(page);
+
+		/* The copy runs with page faults disabled. */
+		pagefault_disable();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
+
+		flush_dcache_page(page);
+		mark_page_accessed(page);
+
+		status = iomap_write_end(inode, pos, bytes, copied, page);
+		if (unlikely(status < 0))
+			break;
+		/* From here on, count only what iomap_write_end() reported. */
+		copied = status;
+
+		cond_resched();
+
+		iov_iter_advance(i, copied);
+		if (unlikely(copied == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_SIZE - offset,
+						iov_iter_single_seg_count(i));
+			goto again;
+		}
+		pos += copied;
+		written += copied;
+		length -= copied;
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+	} while (iov_iter_count(i) && length);
+
+	return written ? written : status;
+}
+
+/*
+ * Buffered write entry point: repeatedly applies iomap_write_actor() through
+ * iomap_apply(), one mapping at a time, starting at iocb->ki_pos, until the
+ * iterator is drained or an application returns zero or an error.
+ *
+ * Returns the total number of bytes written if any were, otherwise the
+ * result of the last iomap_apply() call (0 if none was made).
+ */
+ssize_t
+iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	loff_t start = iocb->ki_pos;
+	loff_t done = 0;
+	loff_t status = 0;
+
+	for (; iov_iter_count(iter); done += status) {
+		status = iomap_apply(inode, start + done, iov_iter_count(iter),
+				IOMAP_WRITE, ops, iter, iomap_write_actor);
+		if (status <= 0)
+			break;
+	}
+
+	if (done)
+		return done;
+	return status;
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+
+/*
+ * Zero @bytes bytes starting at @offset within the page cache page that
+ * covers file position @pos, going through the regular
+ * iomap_write_begin()/iomap_write_end() path.
+ *
+ * Returns the error from iomap_write_begin() if it fails, otherwise the
+ * return value of iomap_write_end().
+ */
+static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
+		unsigned bytes, struct iomap *iomap)
+{
+	struct page *page;
+	int status;
+
+	status = iomap_write_begin(inode, pos, bytes,
+			AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
+	if (status)
+		return status;
+
+	zero_user(page, offset, bytes);
+	mark_page_accessed(page);
+
+	/* All @bytes were "copied", so pass @bytes for both len and copied. */
+	return iomap_write_end(inode, pos, bytes, bytes, page);
+}
+
+/*
+ * DAX flavour of zeroing: work out the sector backing the page that contains
+ * @pos from the mapping in @iomap, then let __dax_zero_page_range() zero
+ * @bytes bytes at @offset within it.
+ */
+static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
+		struct iomap *iomap)
+{
+	sector_t sector;
+
+	/* Start of the map plus the page's distance into it, in 512b units. */
+	sector = iomap->blkno;
+	sector += ((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9;
+
+	return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+}
+
+/*
+ * iomap_apply() actor for iomap_zero_range().
+ *
+ * Zeroes @count bytes starting at @pos, one page-sized piece at a time,
+ * using the DAX helper for DAX inodes and the page cache otherwise.  Holes
+ * and unwritten extents are skipped.  @data is an optional bool pointer
+ * that is set to true once any piece has been zeroed.
+ *
+ * Returns the number of bytes covered (@count when the extent is skipped),
+ * or a negative error from the zeroing helper.
+ */
+static loff_t
+iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
+		void *data, struct iomap *iomap)
+{
+	bool *did_zero = data;
+	loff_t written = 0;
+	int status;
+
+	/* already zeroed?  we're done. */
+	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+		return count;
+
+	do {
+		unsigned offset, bytes;
+
+		offset = pos & (PAGE_SIZE - 1); /* Within page */
+		/*
+		 * Compare in loff_t: narrowing count to 32 bits first would
+		 * yield bytes == 0 whenever its low 32 bits are zero, and the
+		 * loop would then never make progress.
+		 */
+		bytes = min_t(loff_t, PAGE_SIZE - offset, count);
+
+		if (IS_DAX(inode))
+			status = iomap_dax_zero(pos, offset, bytes, iomap);
+		else
+			status = iomap_zero(inode, pos, offset, bytes, iomap);
+		if (status < 0)
+			return status;
+
+		pos += bytes;
+		count -= bytes;
+		written += bytes;
+		if (did_zero)
+			*did_zero = true;
+	} while (count > 0);
+
+	return written;
+}
+
+/*
+ * Zero the byte range [@pos, @pos + @len) of @inode by applying
+ * iomap_zero_range_actor() to one mapping after another.
+ *
+ * Returns 0 once the whole range has been processed, or the first
+ * non-positive result handed back by iomap_apply().
+ */
+int
+iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
+		struct iomap_ops *ops)
+{
+	loff_t done;
+
+	for (; len > 0; pos += done, len -= done) {
+		done = iomap_apply(inode, pos, len, IOMAP_ZERO,
+				ops, did_zero, iomap_zero_range_actor);
+		if (done <= 0)
+			return done;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_zero_range);
+
+/*
+ * Zero from @pos up to the end of the filesystem block that contains it.
+ * Does nothing and returns 0 when @pos already sits on a block boundary;
+ * otherwise returns the result of iomap_zero_range().
+ */
+int
+iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+		struct iomap_ops *ops)
+{
+	unsigned bsize = (1 << inode->i_blkbits);
+	unsigned partial = pos & (bsize - 1);
+
+	if (partial)
+		return iomap_zero_range(inode, pos, bsize - partial,
+				did_zero, ops);
+
+	/* Block boundary: nothing to do. */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_truncate_page);
+
+/*
+ * iomap_apply() actor for iomap_page_mkwrite().
+ *
+ * @data is the page being made writable.  Its buffers are mapped from
+ * @iomap via __block_write_begin_int() and then committed with
+ * block_commit_write().
+ *
+ * Returns @length on success or the error from __block_write_begin_int().
+ */
+static loff_t
+iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
+		void *data, struct iomap *iomap)
+{
+	struct page *page = data;
+	int ret;
+
+	/*
+	 * NOTE(review): pos & ~PAGE_MASK is presumably the offset within the
+	 * page (PAGE_MASK is defined outside this file) - confirm.
+	 */
+	ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
+			NULL, iomap);
+	if (ret)
+		return ret;
+
+	/*
+	 * NOTE(review): the commit range always starts at 0, independent of
+	 * where @pos falls inside the page.
+	 */
+	block_commit_write(page, 0, length);
+	return length;
+}
+
+/*
+ * Make the page in @vmf writable: map (allocating through @ops where
+ * needed) the part of the page that lies inside EOF, then mark the page
+ * dirty and wait for it to become stable.
+ *
+ * Returns 0 with the page still locked on success.  On failure the page is
+ * unlocked and the error is returned; -EFAULT means the page was truncated
+ * or no longer belongs to this mapping.
+ */
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = file_inode(vma->vm_file);
+	unsigned long length;
+	loff_t offset, size;
+	ssize_t ret;
+
+	lock_page(page);
+	size = i_size_read(inode);
+	offset = page_offset(page);
+	if ((page->mapping != inode->i_mapping) ||
+	    (offset > size)) {
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/*
+	 * page is wholly or partially inside EOF.  Compare using the byte
+	 * offset from page_offset() rather than shifting page->index, so the
+	 * end-of-page position is not computed in the width of pgoff_t.
+	 */
+	if (offset + PAGE_SIZE > size)
+		length = size & ~PAGE_MASK;
+	else
+		length = PAGE_SIZE;
+
+	while (length > 0) {
+		ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
+				ops, page, iomap_page_mkwrite_actor);
+		if (unlikely(ret <= 0))
+			goto out_unlock;
+		offset += ret;
+		length -= ret;
+	}
+
+	set_page_dirty(page);
+	wait_for_stable_page(page);
+	return 0;
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
+
+/*
+ * State carried across iomap_fiemap_actor() calls by iomap_fiemap().
+ */
+struct fiemap_ctx {
+	/* Destination passed on to fiemap_fill_next_extent(). */
+	struct fiemap_extent_info *fi;
+	/*
+	 * Most recent non-hole mapping seen; it is reported on the next
+	 * actor call, or by iomap_fiemap() with FIEMAP_EXTENT_LAST.
+	 */
+	struct iomap prev;
+};
+
+/*
+ * Report @iomap to userspace as one fiemap extent.  Holes are not reported
+ * (returns 0).  Otherwise returns the result of fiemap_fill_next_extent();
+ * @flags is extended with type specific bits and FIEMAP_EXTENT_MERGED.
+ */
+static int iomap_to_fiemap(struct fiemap_extent_info *fi,
+		struct iomap *iomap, u32 flags)
+{
+	u64 addr = 0;
+
+	/* skip holes */
+	if (iomap->type == IOMAP_HOLE)
+		return 0;
+
+	if (iomap->type == IOMAP_DELALLOC)
+		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
+	else if (iomap->type == IOMAP_UNWRITTEN)
+		flags |= FIEMAP_EXTENT_UNWRITTEN;
+
+	/* Physical address in bytes, or 0 when there is no block. */
+	if (iomap->blkno != IOMAP_NULL_BLOCK)
+		addr = iomap->blkno << 9;
+
+	return fiemap_fill_next_extent(fi, iomap->offset, addr,
+			iomap->length, flags | FIEMAP_EXTENT_MERGED);
+}
+
+/*
+ * iomap_apply() actor for iomap_fiemap().
+ *
+ * Reporting runs one mapping behind: each non-hole mapping first emits the
+ * previously remembered one, then takes its place in the context.  Holes
+ * are stepped over.
+ *
+ * Returns @length to continue, 0 when the extent array is full, or a
+ * negative error.
+ */
+static loff_t
+iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct fiemap_ctx *ctx = data;
+	loff_t rc;
+
+	if (iomap->type == IOMAP_HOLE)
+		return length;
+
+	rc = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
+	ctx->prev = *iomap;
+
+	if (rc == 0)		/* success */
+		return length;
+	if (rc == 1)		/* extent array full */
+		return 0;
+	return rc;
+}
+
+/*
+ * Generic fiemap implementation on top of iomap.
+ *
+ * Walks [@start, @start + @len) of @inode one mapping at a time and reports
+ * every non-hole mapping through @fi.  The final mapping seen is reported
+ * with FIEMAP_EXTENT_LAST.  Only FIEMAP_FLAG_SYNC is accepted as a request
+ * flag, and the file is written back before mapping only when the caller
+ * asked for it with that flag.
+ *
+ * Returns 0 on success or a negative error.
+ */
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
+		loff_t start, loff_t len, struct iomap_ops *ops)
+{
+	struct fiemap_ctx ctx;
+	loff_t ret;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.fi = fi;
+	/* A hole in prev means "nothing pending to report". */
+	ctx.prev.type = IOMAP_HOLE;
+
+	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	/* Flushing dirty data is opt-in via FIEMAP_FLAG_SYNC. */
+	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
+		ret = filemap_write_and_wait(inode->i_mapping);
+		if (ret)
+			return ret;
+	}
+
+	while (len > 0) {
+		ret = iomap_apply(inode, start, len, 0, ops, &ctx,
+				iomap_fiemap_actor);
+		if (ret < 0)
+			return ret;
+		/* 0 means the extent array is full: stop walking. */
+		if (ret == 0)
+			break;
+
+		start += ret;
+		len -= ret;
+	}
+
+	/* Flush the mapping the actor was still holding back. */
+	if (ctx.prev.type != IOMAP_HOLE) {
+		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_fiemap);
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@ -2,6 +2,7 @@
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/genhd.h>
 #include <linux/slab.h>
 #include <linux/pr.h>
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@ -3,6 +3,7 @@
 */
 #include <linux/sunrpc/svc.h>
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/nfs4.h>

 #include "nfsd.h"
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@ -4,6 +4,7 @@ config XFS_FS
 	depends on (64BIT || LBDAF)
 	select EXPORTFS
 	select LIBCRC32C
+	select FS_IOMAP
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@ -84,7 +84,7 @@ xfs_alloc_lookup_ge(
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
-int					/* error */
+static int				/* error */
 xfs_alloc_lookup_le(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		bno,	/* starting block of extent */
@ -1839,19 +1839,8 @@ void
 xfs_alloc_compute_maxlevels(
 	xfs_mount_t	*mp)	/* file system mount structure */
 {
-	int		level;
-	uint		maxblocks;
-	uint		maxleafents;
-	int		minleafrecs;
-	int		minnoderecs;
-
-	maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
-	minleafrecs = mp->m_alloc_mnr[0];
-	minnoderecs = mp->m_alloc_mnr[1];
-	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-	for (level = 1; maxblocks > 1; level++)
-		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-	mp->m_ag_maxlevels = level;
+	mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
+			(mp->m_sb.sb_agblocks + 1) / 2);
 }

 /*
@ -2658,55 +2647,79 @@ error0:
 	return error;
 }

-/*
- * Free an extent.
- * Just break up the extent address and hand off to xfs_free_ag_extent
- * after fixing up the freelist.
- */
-int				/* error */
-xfs_free_extent(
-	xfs_trans_t	*tp,	/* transaction pointer */
-	xfs_fsblock_t	bno,	/* starting block number of extent */
-	xfs_extlen_t	len)	/* length of extent */
+/* Ensure that the freelist is at full capacity. */
+int
+xfs_free_extent_fix_freelist(
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	struct xfs_buf		**agbp)
 {
-	xfs_alloc_arg_t	args;
-	int		error;
+	struct xfs_alloc_arg	args;
+	int			error;

-	ASSERT(len != 0);
-	memset(&args, 0, sizeof(xfs_alloc_arg_t));
+	memset(&args, 0, sizeof(struct xfs_alloc_arg));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
+	args.agno = agno;

 	/*
 	 * validate that the block number is legal - the enables us to detect
 	 * and handle a silent filesystem corruption rather than crashing.
 	 */
-	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
 	if (args.agno >= args.mp->m_sb.sb_agcount)
 		return -EFSCORRUPTED;

-	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
-	if (args.agbno >= args.mp->m_sb.sb_agblocks)
-		return -EFSCORRUPTED;
-
 	args.pag = xfs_perag_get(args.mp, args.agno);
 	ASSERT(args.pag);

 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
 	if (error)
-		goto error0;
+		goto out;

-	/* validate the extent size is legal now we have the agf locked */
-	if (args.agbno + len >
-			be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
-		error = -EFSCORRUPTED;
-		goto error0;
-	}
-
-	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
-	if (!error)
-		xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
-error0:
+	*agbp = args.agbp;
+out:
 	xfs_perag_put(args.pag);
 	return error;
 }
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int				/* error */
+xfs_free_extent(
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_fsblock_t		bno,	/* starting block number of extent */
+	xfs_extlen_t		len)	/* length of extent */
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*agbp;
+	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, bno);
+	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	int			error;
+
+	ASSERT(len != 0);
+
+	error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
+	if (error)
+		return error;
+
+	XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
+
+	/* validate the extent size is legal now we have the agf locked */
+	XFS_WANT_CORRUPTED_GOTO(mp,
+		agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
+				err);
+
+	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
+	if (error)
+		goto err;
+
+	xfs_extent_busy_insert(tp, agno, agbno, len, 0);
+	return 0;
+
+err:
+	xfs_trans_brelse(tp, agbp);
+	return error;
+}
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@ -212,13 +212,6 @@ xfs_free_extent(
 	xfs_fsblock_t	bno,	/* starting block number of extent */
 	xfs_extlen_t	len);	/* length of extent */

-int					/* error */
-xfs_alloc_lookup_le(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* starting block of extent */
-	xfs_extlen_t		len,	/* length of extent */
-	int			*stat);	/* success/failure */
-
 int				/* error */
 xfs_alloc_lookup_ge(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
@ -236,5 +229,7 @@ xfs_alloc_get_rec(
 int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+		struct xfs_buf **agbp);

 #endif	/* __XFS_ALLOC_H__ */
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@ -50,7 +50,6 @@ int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
 int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
 int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
 int	xfs_attr_shortform_remove(struct xfs_da_args *args);
-int	xfs_attr_shortform_list(struct xfs_attr_list_context *context);
 int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int	xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
 void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
@ -88,8 +87,6 @@ int	xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
 void	xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
 				       struct xfs_da_state_blk *drop_blk,
 				       struct xfs_da_state_blk *save_blk);
-int	xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
-
 /*
 * Utility routines.
 */
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@ -570,14 +570,12 @@ xfs_bmap_validate_ret(
 */
 void
 xfs_bmap_add_free(
+	struct xfs_mount	*mp,		/* mount point structure */
+	struct xfs_bmap_free	*flist,		/* list of extents */
 	xfs_fsblock_t		bno,		/* fs block number of extent */
-	xfs_filblks_t		len,		/* length of extent */
-	xfs_bmap_free_t		*flist,		/* list of extents */
-	xfs_mount_t		*mp)		/* mount point structure */
+	xfs_filblks_t		len)		/* length of extent */
 {
-	xfs_bmap_free_item_t	*cur;		/* current (next) element */
-	xfs_bmap_free_item_t	*new;		/* new element */
-	xfs_bmap_free_item_t	*prev;		/* previous element */
+	struct xfs_bmap_free_item	*new;		/* new element */
 #ifdef DEBUG
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
@ -597,17 +595,7 @@ xfs_bmap_add_free(
 	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
 	new->xbfi_startblock = bno;
 	new->xbfi_blockcount = (xfs_extlen_t)len;
-	for (prev = NULL, cur = flist->xbf_first;
-	     cur != NULL;
-	     prev = cur, cur = cur->xbfi_next) {
-		if (cur->xbfi_startblock >= bno)
-			break;
-	}
-	if (prev)
-		prev->xbfi_next = new;
-	else
-		flist->xbf_first = new;
-	new->xbfi_next = cur;
+	list_add(&new->xbfi_list, &flist->xbf_flist);
 	flist->xbf_count++;
 }

@ -617,14 +605,10 @@ xfs_bmap_add_free(
 */
 void
 xfs_bmap_del_free(
-	xfs_bmap_free_t		*flist,	/* free item list header */
-	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
-	xfs_bmap_free_item_t	*free)	/* list item to be freed */
+	struct xfs_bmap_free		*flist,	/* free item list header */
+	struct xfs_bmap_free_item	*free)	/* list item to be freed */
 {
-	if (prev)
-		prev->xbfi_next = free->xbfi_next;
-	else
-		flist->xbf_first = free->xbfi_next;
+	list_del(&free->xbfi_list);
 	flist->xbf_count--;
 	kmem_zone_free(xfs_bmap_free_item_zone, free);
 }
@ -634,17 +618,16 @@ xfs_bmap_del_free(
 */
 void
 xfs_bmap_cancel(
-	xfs_bmap_free_t		*flist)	/* list of bmap_free_items */
+	struct xfs_bmap_free		*flist)	/* list of bmap_free_items */
 {
-	xfs_bmap_free_item_t	*free;	/* free list item */
-	xfs_bmap_free_item_t	*next;
+	struct xfs_bmap_free_item	*free;	/* free list item */

 	if (flist->xbf_count == 0)
 		return;
-	ASSERT(flist->xbf_first != NULL);
-	for (free = flist->xbf_first; free; free = next) {
-		next = free->xbfi_next;
-		xfs_bmap_del_free(flist, NULL, free);
+	while (!list_empty(&flist->xbf_flist)) {
+		free = list_first_entry(&flist->xbf_flist,
+				struct xfs_bmap_free_item, xbfi_list);
+		xfs_bmap_del_free(flist, free);
 	}
 	ASSERT(flist->xbf_count == 0);
 }
@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents(
 	cblock = XFS_BUF_TO_BLOCK(cbp);
 	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
 		return error;
-	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+	xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
 	ip->i_d.di_nblocks--;
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, cbp);
@ -5073,8 +5056,8 @@ xfs_bmap_del_extent(
 	 * If we need to, add to list of extents to delete.
 	 */
 	if (do_fx)
-		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
-			mp);
+		xfs_bmap_add_free(mp, flist, del->br_startblock,
+			del->br_blockcount);
 	/*
 	 * Adjust inode # blocks in the file.
 	 */
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@ -62,12 +62,12 @@ struct xfs_bmalloca {
 * List of extents to be free "later".
 * The list is kept sorted on xbf_startblock.
 */
-typedef struct xfs_bmap_free_item
+struct xfs_bmap_free_item
 {
 	xfs_fsblock_t		xbfi_startblock;/* starting fs block number */
 	xfs_extlen_t		xbfi_blockcount;/* number of blocks in extent */
-	struct xfs_bmap_free_item *xbfi_next;	/* link to next entry */
-} xfs_bmap_free_item_t;
+	struct list_head	xbfi_list;
+};

 /*
 * Header for free extent list.
@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item
 */
 typedef	struct xfs_bmap_free
 {
-	xfs_bmap_free_item_t	*xbf_first;	/* list of to-be-free extents */
+	struct list_head	xbf_flist;	/* list of to-be-free extents */
 	int			xbf_count;	/* count of items on list */
 	int			xbf_low;	/* alloc in low mode */
 } xfs_bmap_free_t;
@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w)

 static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 {
-	((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
-		(flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
+	INIT_LIST_HEAD(&flp->xbf_flist);
+	flp->xbf_count = 0;
+	flp->xbf_low = 0;
+	*fbp = NULLFSBLOCK;
 }

 /*
@ -191,8 +193,8 @@ void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,

 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
-		struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void	xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
+			  xfs_fsblock_t bno, xfs_filblks_t len);
 void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
 int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
 			struct xfs_inode *ip);
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@ -526,7 +526,7 @@ xfs_bmbt_free_block(
 	struct xfs_trans	*tp = cur->bc_tp;
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));

-	xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+	xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
 	ip->i_d.di_nblocks--;

 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@ -543,12 +543,12 @@ xfs_btree_ptr_addr(
 */
 STATIC struct xfs_btree_block *
 xfs_btree_get_iroot(
-       struct xfs_btree_cur    *cur)
+	struct xfs_btree_cur	*cur)
 {
-       struct xfs_ifork        *ifp;
+	struct xfs_ifork	*ifp;

-       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
-       return (struct xfs_btree_block *)ifp->if_broot;
+	ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+	return (struct xfs_btree_block *)ifp->if_broot;
 }

 /*
@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify(

 	return true;
 }
+
+/*
+ * Calculate the number of btree levels needed to store len records in a
+ * short-format btree with limits[0] records per leaf and limits[1] per node.
+ */
+uint
+xfs_btree_compute_maxlevels(
+	struct xfs_mount	*mp,
+	uint			*limits,
+	unsigned long		len)
+{
+	uint			level;
+	unsigned long		maxblocks;
+
+	maxblocks = (len + limits[0] - 1) / limits[0];
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + limits[1] - 1) / limits[1];
+	return level;
+}
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)

 bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
 bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
+				 unsigned long len);

 #endif	/* __XFS_BTREE_H__ */
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@ -356,7 +356,6 @@ xfs_da3_split(
 	struct xfs_da_state_blk	*newblk;
 	struct xfs_da_state_blk	*addblk;
 	struct xfs_da_intnode	*node;
-	struct xfs_buf		*bp;
 	int			max;
 	int			action = 0;
 	int			error;
@ -397,7 +396,9 @@ xfs_da3_split(
 				break;
 			}
 			/*
-			 * Entry wouldn't fit, split the leaf again.
+			 * Entry wouldn't fit, split the leaf again. The new
+			 * extrablk will be consumed by xfs_da3_node_split if
+			 * the node is split.
 			 */
 			state->extravalid = 1;
 			if (state->inleaf) {
@ -445,6 +446,14 @@ xfs_da3_split(
 	if (!addblk)
 		return 0;

+	/*
+	 * xfs_da3_node_split() should have consumed any extra blocks we added
+	 * during a double leaf split in the attr fork. This is guaranteed as
+	 * we can't be here if the attr fork only has a single leaf block.
+	 */
+	ASSERT(state->extravalid == 0 ||
+	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
+
 	/*
 	 * Split the root node.
 	 */
@ -457,43 +466,33 @@ xfs_da3_split(
 	}

 	/*
-	 * Update pointers to the node which used to be block 0 and
-	 * just got bumped because of the addition of a new root node.
-	 * There might be three blocks involved if a double split occurred,
-	 * and the original block 0 could be at any position in the list.
+	 * Update pointers to the node which used to be block 0 and just got
+	 * bumped because of the addition of a new root node.  Note that the
+	 * original block 0 could be at any position in the list of blocks in
+	 * the tree.
 	 *
-	 * Note: the magic numbers and sibling pointers are in the same
-	 * physical place for both v2 and v3 headers (by design). Hence it
-	 * doesn't matter which version of the xfs_da_intnode structure we use
-	 * here as the result will be the same using either structure.
+	 * Note: the magic numbers and sibling pointers are in the same physical
+	 * place for both v2 and v3 headers (by design). Hence it doesn't matter
+	 * which version of the xfs_da_intnode structure we use here as the
+	 * result will be the same using either structure.
 	 */
 	node = oldblk->bp->b_addr;
 	if (node->hdr.info.forw) {
-		if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
-			bp = addblk->bp;
-		} else {
-			ASSERT(state->extravalid);
-			bp = state->extrablk.bp;
-		}
-		node = bp->b_addr;
+		ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
+		node = addblk->bp->b_addr;
 		node->hdr.info.back = cpu_to_be32(oldblk->blkno);
-		xfs_trans_log_buf(state->args->trans, bp,
-		    XFS_DA_LOGRANGE(node, &node->hdr.info,
-		    sizeof(node->hdr.info)));
+		xfs_trans_log_buf(state->args->trans, addblk->bp,
+				  XFS_DA_LOGRANGE(node, &node->hdr.info,
+				  sizeof(node->hdr.info)));
 	}
 	node = oldblk->bp->b_addr;
 	if (node->hdr.info.back) {
-		if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
-			bp = addblk->bp;
-		} else {
-			ASSERT(state->extravalid);
-			bp = state->extrablk.bp;
-		}
-		node = bp->b_addr;
+		ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
+		node = addblk->bp->b_addr;
 		node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
-		xfs_trans_log_buf(state->args->trans, bp,
-		    XFS_DA_LOGRANGE(node, &node->hdr.info,
-		    sizeof(node->hdr.info)));
+		xfs_trans_log_buf(state->args->trans, addblk->bp,
+				  XFS_DA_LOGRANGE(node, &node->hdr.info,
+				  sizeof(node->hdr.info)));
 	}
 	addblk->bp = NULL;
 	return 0;
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@ -40,8 +40,7 @@ xfs_dir2_sf_entsize(
 	int count = sizeof(struct xfs_dir2_sf_entry);	/* namelen + offset */

 	count += len;					/* name */
-	count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
-				sizeof(xfs_dir2_ino4_t); /* ino # */
+	count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
 	return count;
 }

@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype(
 static xfs_ino_t
 xfs_dir2_sf_get_ino(
 	struct xfs_dir2_sf_hdr	*hdr,
-	xfs_dir2_inou_t		*from)
+	__uint8_t		*from)
 {
 	if (hdr->i8count)
-		return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+		return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
 	else
-		return get_unaligned_be32(&from->i4.i);
+		return get_unaligned_be32(from);
 }

 static void
 xfs_dir2_sf_put_ino(
 	struct xfs_dir2_sf_hdr	*hdr,
-	xfs_dir2_inou_t		*to,
+	__uint8_t		*to,
 	xfs_ino_t		ino)
 {
 	ASSERT((ino & 0xff00000000000000ULL) == 0);

 	if (hdr->i8count)
-		put_unaligned_be64(ino, &to->i8.i);
+		put_unaligned_be64(ino, to);
 	else
-		put_unaligned_be32(ino, &to->i4.i);
+		put_unaligned_be32(ino, to);
 }

 static xfs_ino_t
 xfs_dir2_sf_get_parent_ino(
 	struct xfs_dir2_sf_hdr	*hdr)
 {
-	return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+	return xfs_dir2_sf_get_ino(hdr, hdr->parent);
 }

 static void
@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino(
 	struct xfs_dir2_sf_hdr	*hdr,
 	xfs_ino_t		ino)
 {
-	xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+	xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
 }

 /*
@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino(
 	struct xfs_dir2_sf_hdr	*hdr,
 	struct xfs_dir2_sf_entry *sfep)
 {
-	return xfs_dir2_sf_get_ino(hdr,
-				(xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+	return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
 }

 static void
@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino(
 	struct xfs_dir2_sf_entry *sfep,
 	xfs_ino_t		ino)
 {
-	xfs_dir2_sf_put_ino(hdr,
-			    (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+	xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
 }

 static xfs_ino_t
@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino(
 	struct xfs_dir2_sf_hdr	*hdr,
 	struct xfs_dir2_sf_entry *sfep)
 {
-	return xfs_dir2_sf_get_ino(hdr,
-			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+	return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
 }

 static void
@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino(
 	struct xfs_dir2_sf_entry *sfep,
 	xfs_ino_t		ino)
 {
-	xfs_dir2_sf_put_ino(hdr,
-			(xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+	xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
 }


--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@ -191,12 +191,6 @@ typedef	__uint16_t	xfs_dir2_data_off_t;
 #define	NULLDATAOFF	0xffffU
 typedef uint		xfs_dir2_data_aoff_t;	/* argument form */

-/*
- * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
- * Only need 16 bits, this is the byte offset into the single block form.
- */
-typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
-
 /*
 * Offset in data space of a data entry.
 */
@ -214,22 +208,10 @@ typedef	xfs_off_t	xfs_dir2_off_t;
 */
 typedef	__uint32_t	xfs_dir2_db_t;

-/*
- * Inode number stored as 8 8-bit values.
- */
-typedef	struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+#define XFS_INO32_SIZE	4
+#define XFS_INO64_SIZE	8
+#define XFS_INO64_DIFF	(XFS_INO64_SIZE - XFS_INO32_SIZE)

-/*
- * Inode number stored as 4 8-bit values.
- * Works a lot of the time, when all the inode numbers in a directory
- * fit in 32 bits.
- */
-typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
-
-typedef union {
-	xfs_dir2_ino8_t	i8;
-	xfs_dir2_ino4_t	i4;
-} xfs_dir2_inou_t;
 #define	XFS_DIR2_MAX_SHORT_INUM	((xfs_ino_t)0xffffffffULL)

 /*
@ -246,39 +228,38 @@ typedef union {
 typedef struct xfs_dir2_sf_hdr {
 	__uint8_t		count;		/* count of entries */
 	__uint8_t		i8count;	/* count of 8-byte inode #s */
-	xfs_dir2_inou_t		parent;		/* parent dir inode number */
-} __arch_pack xfs_dir2_sf_hdr_t;
+	__uint8_t		parent[8];	/* parent dir inode number */
+} __packed xfs_dir2_sf_hdr_t;

 typedef struct xfs_dir2_sf_entry {
 	__u8			namelen;	/* actual name length */
-	xfs_dir2_sf_off_t	offset;		/* saved offset */
+	__u8			offset[2];	/* saved offset */
 	__u8			name[];		/* name, variable size */
 	/*
 	 * A single byte containing the file type field follows the inode
 	 * number for version 3 directory entries.
 	 *
-	 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
-	 * variable offset after the name.
+	 * A 64-bit or 32-bit inode number follows here, at a variable offset
+	 * after the name.
 	 */
-} __arch_pack xfs_dir2_sf_entry_t;
+} xfs_dir2_sf_entry_t;

 static inline int xfs_dir2_sf_hdr_size(int i8count)
 {
 	return sizeof(struct xfs_dir2_sf_hdr) -
-		(i8count == 0) *
-		(sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
+		(i8count == 0) * XFS_INO64_DIFF;
 }

 static inline xfs_dir2_data_aoff_t
 xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
 {
-	return get_unaligned_be16(&sfep->offset.i);
+	return get_unaligned_be16(sfep->offset);
 }

 static inline void
 xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
 {
-	put_unaligned_be16(off, &sfep->offset.i);
+	put_unaligned_be16(off, sfep->offset);
 }

 static inline struct xfs_dir2_sf_entry *
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@ -126,13 +126,12 @@ xfs_dir2_block_sfsize(
 		/*
 		 * Calculate the new size, see if we should give up yet.
 		 */
-		size = xfs_dir2_sf_hdr_size(i8count) +		/* header */
-		       count +					/* namelen */
-		       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
-		       namelen +				/* name */
-		       (i8count ?				/* inumber */
-				(uint)sizeof(xfs_dir2_ino8_t) * count :
-				(uint)sizeof(xfs_dir2_ino4_t) * count);
+		size = xfs_dir2_sf_hdr_size(i8count) +	/* header */
+		       count * 3 * sizeof(u8) +		/* namelen + offset */
+		       namelen +			/* name */
+		       (i8count ?			/* inumber */
+				count * XFS_INO64_SIZE :
+				count * XFS_INO32_SIZE);
 		if (size > XFS_IFORK_DSIZE(dp))
 			return size;		/* size value is a failure */
 	}
@ -319,10 +318,7 @@ xfs_dir2_sf_addname(
 		/*
 		 * Yes, adjust the inode size.  old count + (parent + new)
 		 */
-		incr_isize +=
-			(sfp->count + 2) *
-			((uint)sizeof(xfs_dir2_ino8_t) -
-			 (uint)sizeof(xfs_dir2_ino4_t));
+		incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
 		objchange = 1;
 	}

@ -897,11 +893,7 @@ xfs_dir2_sf_replace(
 		int	error;			/* error return value */
 		int	newsize;		/* new inode size */

-		newsize =
-			dp->i_df.if_bytes +
-			(sfp->count + 1) *
-			((uint)sizeof(xfs_dir2_ino8_t) -
-			 (uint)sizeof(xfs_dir2_ino4_t));
+		newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
 		/*
 		 * Won't fit as shortform, convert to block then do replace.
 		 */
@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4(
 	/*
 	 * Compute the new inode size.
 	 */
-	newsize =
-		oldsize -
-		(oldsfp->count + 1) *
-		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
 	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
 	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
 	/*
@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4(
 	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
 		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
 		sfep->namelen = oldsfep->namelen;
-		sfep->offset = oldsfep->offset;
+		memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
 		memcpy(sfep->name, oldsfep->name, sfep->namelen);
 		dp->d_ops->sf_put_ino(sfp, sfep,
 				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8(
 	/*
 	 * Compute the new inode size (nb: entry count + 1 for parent)
 	 */
-	newsize =
-		oldsize +
-		(oldsfp->count + 1) *
-		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
 	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
 	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
 	/*
@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8(
 	     i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
 		  oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
 		sfep->namelen = oldsfep->namelen;
-		sfep->offset = oldsfep->offset;
+		memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
 		memcpy(sfep->name, oldsfep->name, sfep->namelen);
 		dp->d_ops->sf_put_ino(sfp, sfep,
 				      dp->d_ops->sf_get_ino(oldsfp, oldsfep));
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
 * with the crc feature bit, and all accesses to them must be conditional on
 * that flag.
 */
+/* short form block header */
+struct xfs_btree_block_shdr {
+	__be32		bb_leftsib;
+	__be32		bb_rightsib;
+
+	__be64		bb_blkno;
+	__be64		bb_lsn;
+	uuid_t		bb_uuid;
+	__be32		bb_owner;
+	__le32		bb_crc;
+};
+
+/* long form block header */
+struct xfs_btree_block_lhdr {
+	__be64		bb_leftsib;
+	__be64		bb_rightsib;
+
+	__be64		bb_blkno;
+	__be64		bb_lsn;
+	uuid_t		bb_uuid;
+	__be64		bb_owner;
+	__le32		bb_crc;
+	__be32		bb_pad; /* padding for alignment */
+};
+
 struct xfs_btree_block {
 	__be32		bb_magic;	/* magic number for block type */
 	__be16		bb_level;	/* 0 is a leaf */
 	__be16		bb_numrecs;	/* current # of data records */
 	union {
-		struct {
-			__be32		bb_leftsib;
-			__be32		bb_rightsib;
-
-			__be64		bb_blkno;
-			__be64		bb_lsn;
-			uuid_t		bb_uuid;
-			__be32		bb_owner;
-			__le32		bb_crc;
-		} s;			/* short form pointers */
-		struct	{
-			__be64		bb_leftsib;
-			__be64		bb_rightsib;
-
-			__be64		bb_blkno;
-			__be64		bb_lsn;
-			uuid_t		bb_uuid;
-			__be64		bb_owner;
-			__le32		bb_crc;
-			__be32		bb_pad; /* padding for alignment */
-		} l;			/* long form pointers */
+		struct xfs_btree_block_shdr s;
+		struct xfs_btree_block_lhdr l;
 	} bb_u;				/* rest */
 };

-#define XFS_BTREE_SBLOCK_LEN	16	/* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN	24	/* size of a long form block */
+/* size of a short form block header (without the CRC fields) */
+#define XFS_BTREE_SBLOCK_LEN \
+	(offsetof(struct xfs_btree_block, bb_u) + \
+	 offsetof(struct xfs_btree_block_shdr, bb_blkno))
+/* size of a long form block header (without the CRC fields) */
+#define XFS_BTREE_LBLOCK_LEN \
+	(offsetof(struct xfs_btree_block, bb_u) + \
+	 offsetof(struct xfs_btree_block_lhdr, bb_blkno))

 /* sizes of CRC enabled btree blocks */
-#define XFS_BTREE_SBLOCK_CRC_LEN	(XFS_BTREE_SBLOCK_LEN + 40)
-#define XFS_BTREE_LBLOCK_CRC_LEN	(XFS_BTREE_LBLOCK_LEN + 48)
+#define XFS_BTREE_SBLOCK_CRC_LEN \
+	(offsetof(struct xfs_btree_block, bb_u) + \
+	 sizeof(struct xfs_btree_block_shdr))
+#define XFS_BTREE_LBLOCK_CRC_LEN \
+	(offsetof(struct xfs_btree_block, bb_u) + \
+	 sizeof(struct xfs_btree_block_lhdr))

 #define XFS_BTREE_SBLOCK_CRC_OFF \
 	offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@ -521,12 +521,8 @@ typedef struct xfs_swapext
 #define XFS_IOC_ERROR_CLEARALL	     _IOW ('X', 117, struct xfs_error_injection)
 /*	XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118	 */

-/*	XFS_IOC_FREEZE		  -- FIFREEZE   119	 */
-/*	XFS_IOC_THAW		  -- FITHAW     120	 */
-#ifndef FIFREEZE
-#define XFS_IOC_FREEZE		     _IOWR('X', 119, int)
-#define XFS_IOC_THAW		     _IOWR('X', 120, int)
-#endif
+#define XFS_IOC_FREEZE		     _IOWR('X', 119, int)	/* aka FIFREEZE */
+#define XFS_IOC_THAW		     _IOWR('X', 120, int)	/* aka FITHAW */

 #define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk(

 	if (!xfs_inobt_issparse(rec->ir_holemask)) {
 		/* not sparse, calculate extent info directly */
-		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
-				  XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)),
-				  mp->m_ialloc_blks, flist, mp);
+		xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
+				  mp->m_ialloc_blks);
 		return;
 	}

@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk(

 		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
 		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
-		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
-				  flist, mp);
+		xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
+				  contigblk);

 		/* reset range to current bit and carry on... */
 		startidx = endidx = nextbit;
@ -2395,20 +2394,11 @@ void
 xfs_ialloc_compute_maxlevels(
 	xfs_mount_t	*mp)		/* file system mount structure */
 {
-	int		level;
-	uint		maxblocks;
-	uint		maxleafents;
-	int		minleafrecs;
-	int		minnoderecs;
+	uint		inodes;

-	maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
-		XFS_INODES_PER_CHUNK_LOG;
-	minleafrecs = mp->m_inobt_mnr[0];
-	minnoderecs = mp->m_inobt_mnr[1];
-	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
-	for (level = 1; maxblocks > 1; level++)
-		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
-	mp->m_in_maxlevels = level;
+	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+	mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
+							 inodes);
 }

 /*
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
 * Get a buffer for the bitmap or summary file block specified.
 * The buffer is returned read and locked.
 */
-int
+static int
 xfs_rtbuf_get(
 	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
 * We're now finished for good with this page.  Update the page state via the
 * associated buffer_heads, paying attention to the start and end offsets that
 * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
 */
 static void
 xfs_finish_page_writeback(
@ -95,8 +101,9 @@ xfs_finish_page_writeback(
 	int			error)
 {
 	unsigned int		end = bvec->bv_offset + bvec->bv_len - 1;
-	struct buffer_head	*head, *bh;
+	struct buffer_head	*head, *bh, *next;
 	unsigned int		off = 0;
+	unsigned int		bsize;

 	ASSERT(bvec->bv_offset < PAGE_SIZE);
 	ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@ -105,15 +112,17 @@ xfs_finish_page_writeback(

 	bh = head = page_buffers(bvec->bv_page);

+	bsize = bh->b_size;
 	do {
+		next = bh->b_this_page;
 		if (off < bvec->bv_offset)
 			goto next_bh;
 		if (off > end)
 			break;
 		bh->b_end_io(bh, !error);
 next_bh:
-		off += bh->b_size;
-	} while ((bh = bh->b_this_page) != head);
+		off += bsize;
+	} while ((bh = next) != head);
 }

 /*
@ -1041,6 +1050,20 @@ xfs_vm_releasepage(

 	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

+	/*
+	 * mm accommodates an old ext3 case where clean pages might not have had
+	 * the dirty bit cleared. Thus, it can send actual dirty pages to
+	 * ->releasepage() via shrink_active_list(). Conversely,
+	 * block_invalidatepage() can send pages that are still marked dirty
+	 * but otherwise have invalidated buffers.
+	 *
+	 * We've historically freed buffers on the latter. Instead, quietly
+	 * filter out all dirty pages to avoid spurious buffer state warnings.
+	 * This can likely be removed once shrink_active_list() is fixed.
+	 */
+	if (PageDirty(page))
+		return 0;
+
 	xfs_count_page_state(page, &delalloc, &unwritten);

 	if (WARN_ON_ONCE(delalloc))
@ -1144,6 +1167,8 @@ __xfs_get_blocks(
 	ssize_t			size;
 	int			new = 0;

+	BUG_ON(create && !direct);
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;

@ -1151,22 +1176,14 @@ __xfs_get_blocks(
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
 	size = bh_result->b_size;

-	if (!create && direct && offset >= i_size_read(inode))
+	if (!create && offset >= i_size_read(inode))
 		return 0;

 	/*
 	 * Direct I/O is usually done on preallocated files, so try getting
-	 * a block mapping without an exclusive lock first.  For buffered
-	 * writes we already have the exclusive iolock anyway, so avoiding
-	 * a lock roundtrip here by taking the ilock exclusive from the
-	 * beginning is a useful micro optimization.
+	 * a block mapping without an exclusive lock first.
 	 */
-	if (create && !direct) {
-		lockmode = XFS_ILOCK_EXCL;
-		xfs_ilock(ip, lockmode);
-	} else {
-		lockmode = xfs_ilock_data_map_shared(ip);
-	}
+	lockmode = xfs_ilock_data_map_shared(ip);

 	ASSERT(offset <= mp->m_super->s_maxbytes);
 	if (offset + size > mp->m_super->s_maxbytes)
@ -1185,37 +1202,19 @@ __xfs_get_blocks(
 	     (imap.br_startblock == HOLESTARTBLOCK ||
 	      imap.br_startblock == DELAYSTARTBLOCK) ||
 	     (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
-		if (direct || xfs_get_extsz_hint(ip)) {
-			/*
-			 * xfs_iomap_write_direct() expects the shared lock. It
-			 * is unlocked on return.
-			 */
-			if (lockmode == XFS_ILOCK_EXCL)
-				xfs_ilock_demote(ip, lockmode);
+		/*
+		 * xfs_iomap_write_direct() expects the shared lock. It
+		 * is unlocked on return.
+		 */
+		if (lockmode == XFS_ILOCK_EXCL)
+			xfs_ilock_demote(ip, lockmode);

-			error = xfs_iomap_write_direct(ip, offset, size,
-						       &imap, nimaps);
-			if (error)
-				return error;
-			new = 1;
+		error = xfs_iomap_write_direct(ip, offset, size,
+					       &imap, nimaps);
+		if (error)
+			return error;
+		new = 1;

-		} else {
-			/*
-			 * Delalloc reservations do not require a transaction,
-			 * we can go on without dropping the lock here. If we
-			 * are allocating a new delalloc block, make sure that
-			 * we set the new flag so that we mark the buffer new so
-			 * that we know that it is newly allocated if the write
-			 * fails.
-			 */
-			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
-				new = 1;
-			error = xfs_iomap_write_delay(ip, offset, size, &imap);
-			if (error)
-				goto out_unlock;
-
-			xfs_iunlock(ip, lockmode);
-		}
 		trace_xfs_get_blocks_alloc(ip, offset, size,
 				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
 						   : XFS_IO_DELALLOC, &imap);
@ -1236,9 +1235,7 @@ __xfs_get_blocks(
 	}

 	/* trim mapping down to size requested */
-	if (direct || size > (1 << inode->i_blkbits))
-		xfs_map_trim_size(inode, iblock, bh_result,
-				  &imap, offset, size);
+	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);

 	/*
 	 * For unwritten extents do not report a disk address in the buffered
@ -1251,7 +1248,7 @@ __xfs_get_blocks(
 		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
 		/* direct IO needs special help */
-		if (create && direct) {
+		if (create) {
 			if (dax_fault)
 				ASSERT(!ISUNWRITTEN(&imap));
 			else
@ -1280,14 +1277,7 @@ __xfs_get_blocks(
 	     (new || ISUNWRITTEN(&imap))))
 		set_buffer_new(bh_result);

-	if (imap.br_startblock == DELAYSTARTBLOCK) {
-		BUG_ON(direct);
-		if (create) {
-			set_buffer_uptodate(bh_result);
-			set_buffer_mapped(bh_result);
-			set_buffer_delay(bh_result);
-		}
-	}
+	BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);

 	return 0;

@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
 * whereas if we have flags set we will always be called in task context
 * (i.e. from a workqueue).
 */
-STATIC int
+int
 xfs_end_io_direct_write(
 	struct kiocb		*iocb,
 	loff_t			offset,
@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
 	struct kiocb		*iocb,
 	struct iov_iter		*iter)
 {
-	struct inode		*inode = iocb->ki_filp->f_mapping->host;
-	dio_iodone_t		*endio = NULL;
-	int			flags = 0;
-	struct block_device	*bdev;
-
-	if (iov_iter_rw(iter) == WRITE) {
-		endio = xfs_end_io_direct_write;
-		flags = DIO_ASYNC_EXTEND;
-	}
-
-	if (IS_DAX(inode)) {
-		return dax_do_io(iocb, inode, iter,
-				 xfs_get_blocks_direct, endio, 0);
-	}
-
-	bdev = xfs_find_bdev_for_inode(inode);
-	return  __blockdev_direct_IO(iocb, inode, bdev, iter,
-			xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-/*
- * Punch out the delalloc blocks we have already allocated.
- *
- * Don't bother with xfs_setattr given that nothing can have made it to disk yet
- * as the page is still locked at this point.
- */
-STATIC void
-xfs_vm_kill_delalloc_range(
-	struct inode		*inode,
-	loff_t			start,
-	loff_t			end)
-{
-	struct xfs_inode	*ip = XFS_I(inode);
-	xfs_fileoff_t		start_fsb;
-	xfs_fileoff_t		end_fsb;
-	int			error;
-
-	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
-	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
-	if (end_fsb <= start_fsb)
-		return;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-						end_fsb - start_fsb);
-	if (error) {
-		/* something screwed, just bail */
-		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-			xfs_alert(ip->i_mount,
-		"xfs_vm_write_failed: unable to clean up ino %lld",
-					ip->i_ino);
-		}
-	}
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-STATIC void
-xfs_vm_write_failed(
-	struct inode		*inode,
-	struct page		*page,
-	loff_t			pos,
-	unsigned		len)
-{
-	loff_t			block_offset;
-	loff_t			block_start;
-	loff_t			block_end;
-	loff_t			from = pos & (PAGE_SIZE - 1);
-	loff_t			to = from + len;
-	struct buffer_head	*bh, *head;
-	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
-
 	/*
-	 * The request pos offset might be 32 or 64 bit, this is all fine
-	 * on 64-bit platform.  However, for 64-bit pos request on 32-bit
-	 * platform, the high 32-bit will be masked off if we evaluate the
-	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
-	 * 0xfffff000 as an unsigned long, hence the result is incorrect
-	 * which could cause the following ASSERT failed in most cases.
-	 * In order to avoid this, we can evaluate the block_offset of the
-	 * start of the page by using shifts rather than masks the mismatch
-	 * problem.
+	 * We just need the method present so that open/fcntl allow direct I/O.
 	 */
-	block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
-
-	ASSERT(block_offset + from == pos);
-
-	head = page_buffers(page);
-	block_start = 0;
-	for (bh = head; bh != head || !block_start;
-	     bh = bh->b_this_page, block_start = block_end,
-				   block_offset += bh->b_size) {
-		block_end = block_start + bh->b_size;
-
-		/* skip buffers before the write */
-		if (block_end <= from)
-			continue;
-
-		/* if the buffer is after the write, we're done */
-		if (block_start >= to)
-			break;
-
-		/*
-		 * Process delalloc and unwritten buffers beyond EOF. We can
-		 * encounter unwritten buffers in the event that a file has
-		 * post-EOF unwritten extents and an extending write happens to
-		 * fail (e.g., an unaligned write that also involves a delalloc
-		 * to the same page).
-		 */
-		if (!buffer_delay(bh) && !buffer_unwritten(bh))
-			continue;
-
-		if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
-		    block_offset < i_size_read(inode))
-			continue;
-
-		if (buffer_delay(bh))
-			xfs_vm_kill_delalloc_range(inode, block_offset,
-						   block_offset + bh->b_size);
-
-		/*
-		 * This buffer does not contain data anymore. make sure anyone
-		 * who finds it knows that for certain.
-		 */
-		clear_buffer_delay(bh);
-		clear_buffer_uptodate(bh);
-		clear_buffer_mapped(bh);
-		clear_buffer_new(bh);
-		clear_buffer_dirty(bh);
-		clear_buffer_unwritten(bh);
-	}
-
-}
-
-/*
- * This used to call block_write_begin(), but it unlocks and releases the page
- * on error, and we need that page to be able to punch stale delalloc blocks out
- * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
- * the appropriate point.
- */
-STATIC int
-xfs_vm_write_begin(
-	struct file		*file,
-	struct address_space	*mapping,
-	loff_t			pos,
-	unsigned		len,
-	unsigned		flags,
-	struct page		**pagep,
-	void			**fsdata)
-{
-	pgoff_t			index = pos >> PAGE_SHIFT;
-	struct page		*page;
-	int			status;
-	struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;
-
-	ASSERT(len <= PAGE_SIZE);
-
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page)
-		return -ENOMEM;
-
-	status = __block_write_begin(page, pos, len, xfs_get_blocks);
-	if (xfs_mp_fail_writes(mp))
-		status = -EIO;
-	if (unlikely(status)) {
-		struct inode	*inode = mapping->host;
-		size_t		isize = i_size_read(inode);
-
-		xfs_vm_write_failed(inode, page, pos, len);
-		unlock_page(page);
-
-		/*
-		 * If the write is beyond EOF, we only want to kill blocks
-		 * allocated in this write, not blocks that were previously
-		 * written successfully.
-		 */
-		if (xfs_mp_fail_writes(mp))
-			isize = 0;
-		if (pos + len > isize) {
-			ssize_t start = max_t(ssize_t, pos, isize);
-
-			truncate_pagecache_range(inode, start, pos + len);
-		}
-
-		put_page(page);
-		page = NULL;
-	}
-
-	*pagep = page;
-	return status;
-}
-
-/*
- * On failure, we only need to kill delalloc blocks beyond EOF in the range of
- * this specific write because they will never be written. Previous writes
- * beyond EOF where block allocation succeeded do not need to be trashed, so
- * only new blocks from this write should be trashed. For blocks within
- * EOF, generic_write_end() zeros them so they are safe to leave alone and be
- * written with all the other valid data.
- */
-STATIC int
-xfs_vm_write_end(
-	struct file		*file,
-	struct address_space	*mapping,
-	loff_t			pos,
-	unsigned		len,
-	unsigned		copied,
-	struct page		*page,
-	void			*fsdata)
-{
-	int			ret;
-
-	ASSERT(len <= PAGE_SIZE);
-
-	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-	if (unlikely(ret < len)) {
-		struct inode	*inode = mapping->host;
-		size_t		isize = i_size_read(inode);
-		loff_t		to = pos + len;
-
-		if (to > isize) {
-			/* only kill blocks in this write beyond EOF */
-			if (pos > isize)
-				isize = pos;
-			xfs_vm_kill_delalloc_range(inode, isize, to);
-			truncate_pagecache_range(inode, isize, to);
-		}
-	}
-	return ret;
+	return -EINVAL;
 }

 STATIC sector_t
@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
 	.set_page_dirty		= xfs_vm_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
-	.write_begin		= xfs_vm_write_begin,
-	.write_end		= xfs_vm_write_end,
 	.bmap			= xfs_vm_bmap,
 	.direct_IO		= xfs_vm_direct_IO,
 	.migratepage		= buffer_migrate_page,
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@ -60,6 +60,9 @@ int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
 int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
 			         struct buffer_head *map_bh, int create);

+int	xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
+		ssize_t size, void *private);
+
 extern void xfs_count_page_state(struct page *, int *, int *);
 extern struct block_device *xfs_find_bdev_for_inode(struct inode *);

--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@ -322,7 +322,7 @@ xfs_attr3_node_inactive(
 * Recurse (gasp!) through the attribute nodes until we find leaves.
 * We're doing a depth-first traversal in order to invalidate everything.
 */
-int
+static int
 xfs_attr3_root_inactive(
 	struct xfs_trans	**trans,
 	struct xfs_inode	*dp)
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
 * we have to calculate each entries' hashvalue and sort them before
 * we can begin returning them to the user.
 */
-int
+static int
 xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 {
 	attrlist_cursor_kern_t *cursor;
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@ -79,6 +79,23 @@ xfs_zero_extent(
 		GFP_NOFS, true);
 }

+/* Sort bmap items by AG. */
+static int
+xfs_bmap_free_list_cmp(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	struct xfs_mount	*mp = priv;
+	struct xfs_bmap_free_item	*ra;
+	struct xfs_bmap_free_item	*rb;
+
+	ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
+	rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
+	return  XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
+		XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
+}
+
 /*
 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
 * caller.  Frees all the extents that need freeing, which must be done
@ -99,14 +116,15 @@ xfs_bmap_finish(
 	int				error;	/* error return value */
 	int				committed;/* xact committed or not */
 	struct xfs_bmap_free_item	*free;	/* free extent item */
-	struct xfs_bmap_free_item	*next;	/* next item on free list */

 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
 	if (flist->xbf_count == 0)
 		return 0;

+	list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
+
 	efi = xfs_trans_get_efi(*tp, flist->xbf_count);
-	for (free = flist->xbf_first; free; free = free->xbfi_next)
+	list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
 		xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
 			free->xbfi_blockcount);

@ -125,9 +143,7 @@ xfs_bmap_finish(
 		if (committed) {
 			xfs_efi_release(efi);
 			xfs_force_shutdown((*tp)->t_mountp,
-				(error == -EFSCORRUPTED) ?
-					SHUTDOWN_CORRUPT_INCORE :
-					SHUTDOWN_META_IO_ERROR);
+					   SHUTDOWN_META_IO_ERROR);
 		}
 		return error;
 	}
@ -138,15 +154,15 @@ xfs_bmap_finish(
 	 * on error.
 	 */
 	efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
-	for (free = flist->xbf_first; free != NULL; free = next) {
-		next = free->xbfi_next;
-
+	while (!list_empty(&flist->xbf_flist)) {
+		free = list_first_entry(&flist->xbf_flist,
+				struct xfs_bmap_free_item, xbfi_list);
 		error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
 					      free->xbfi_blockcount);
 		if (error)
 			return error;

-		xfs_bmap_del_free(flist, NULL, free);
+		xfs_bmap_del_free(flist, free);
 	}

 	return 0;
@ -409,7 +425,7 @@ xfs_bmap_count_tree(
 /*
 * Count fsblocks of the given fork.
 */
-int						/* error */
+static int					/* error */
 xfs_bmap_count_blocks(
 	xfs_trans_t		*tp,		/* transaction pointer */
 	xfs_inode_t		*ip,		/* incore inode */
@ -799,7 +815,7 @@ xfs_bmap_punch_delalloc_range(
 		if (error)
 			break;

-		ASSERT(!flist.xbf_count && !flist.xbf_first);
+		ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
 next_block:
 		start_fsb++;
 		remaining--;
@ -1089,99 +1105,120 @@ error1:	/* Just cancel transaction */
 	return error;
 }

-/*
- * Zero file bytes between startoff and endoff inclusive.
- * The iolock is held exclusive and no blocks are buffered.
- *
- * This function is used by xfs_free_file_space() to zero
- * partial blocks when the range to free is not block aligned.
- * When unreserving space with boundaries that are not block
- * aligned we round up the start and round down the end
- * boundaries and then use this function to zero the parts of
- * the blocks that got dropped during the rounding.
- */
-STATIC int
-xfs_zero_remaining_bytes(
-	xfs_inode_t		*ip,
-	xfs_off_t		startoff,
-	xfs_off_t		endoff)
+static int
+xfs_unmap_extent(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		startoffset_fsb,
+	xfs_filblks_t		len_fsb,
+	int			*done)
 {
-	xfs_bmbt_irec_t		imap;
-	xfs_fileoff_t		offset_fsb;
-	xfs_off_t		lastoffset;
-	xfs_off_t		offset;
-	xfs_buf_t		*bp;
-	xfs_mount_t		*mp = ip->i_mount;
-	int			nimap;
-	int			error = 0;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	struct xfs_bmap_free	free_list;
+	xfs_fsblock_t		firstfsb;
+	uint			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+	int			error;

-	/*
-	 * Avoid doing I/O beyond eof - it's not necessary
-	 * since nothing can read beyond eof.  The space will
-	 * be zeroed when the file is extended anyway.
-	 */
-	if (startoff >= XFS_ISIZE(ip))
-		return 0;
-
-	if (endoff > XFS_ISIZE(ip))
-		endoff = XFS_ISIZE(ip);
-
-	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
-		uint lock_mode;
-
-		offset_fsb = XFS_B_TO_FSBT(mp, offset);
-		nimap = 1;
-
-		lock_mode = xfs_ilock_data_map_shared(ip);
-		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
-		xfs_iunlock(ip, lock_mode);
-
-		if (error || nimap < 1)
-			break;
-		ASSERT(imap.br_blockcount >= 1);
-		ASSERT(imap.br_startoff == offset_fsb);
-		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-
-		if (imap.br_startblock == HOLESTARTBLOCK ||
-		    imap.br_state == XFS_EXT_UNWRITTEN) {
-			/* skip the entire extent */
-			lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
-						      imap.br_blockcount) - 1;
-			continue;
-		}
-
-		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
-		if (lastoffset > endoff)
-			lastoffset = endoff;
-
-		/* DAX can just zero the backing device directly */
-		if (IS_DAX(VFS_I(ip))) {
-			error = dax_zero_page_range(VFS_I(ip), offset,
-						    lastoffset - offset + 1,
-						    xfs_get_blocks_direct);
-			if (error)
-				return error;
-			continue;
-		}
-
-		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp,
-				xfs_fsb_to_db(ip, imap.br_startblock),
-				BTOBB(mp->m_sb.sb_blocksize),
-				0, &bp, NULL);
-		if (error)
-			return error;
-
-		memset(bp->b_addr +
-				(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
-		       0, lastoffset - offset + 1);
-
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-		if (error)
-			return error;
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+	if (error) {
+		ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+		return error;
 	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
+			ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		goto out_trans_cancel;
+
+	xfs_trans_ijoin(tp, ip, 0);
+
+	xfs_bmap_init(&free_list, &firstfsb);
+	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
+			&free_list, done);
+	if (error)
+		goto out_bmap_cancel;
+
+	error = xfs_bmap_finish(&tp, &free_list, NULL);
+	if (error)
+		goto out_bmap_cancel;
+
+	error = xfs_trans_commit(tp);
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
+
+out_bmap_cancel:
+	xfs_bmap_cancel(&free_list);
+out_trans_cancel:
+	xfs_trans_cancel(tp);
+	goto out_unlock;
+}
+
+static int
+xfs_adjust_extent_unmap_boundaries(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*startoffset_fsb,
+	xfs_fileoff_t		*endoffset_fsb)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	int			nimap, error;
+	xfs_extlen_t		mod = 0;
+
+	nimap = 1;
+	error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
+	if (error)
+		return error;
+
+	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+		xfs_daddr_t	block;
+
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+		block = imap.br_startblock;
+		mod = do_div(block, mp->m_sb.sb_rextsize);
+		if (mod)
+			*startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+	}
+
+	nimap = 1;
+	error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
+	if (error)
+		return error;
+
+	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+		mod++;
+		if (mod && mod != mp->m_sb.sb_rextsize)
+			*endoffset_fsb -= mod;
+	}
+
+	return 0;
+}
+
+static int
+xfs_flush_unmap_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct inode		*inode = VFS_I(ip);
+	xfs_off_t		rounding, start, end;
+	int			error;
+
+	/* wait for the completion of any pending DIOs */
+	inode_dio_wait(inode);
+
+	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
+	start = round_down(offset, rounding);
+	end = round_up(offset + len, rounding) - 1;
+
+	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (error)
+		return error;
+	truncate_pagecache_range(inode, start, end);
+	return 0;
 }

 int
@ -1190,24 +1227,10 @@ xfs_free_file_space(
 	xfs_off_t		offset,
 	xfs_off_t		len)
 {
-	int			done;
-	xfs_fileoff_t		endoffset_fsb;
-	int			error;
-	xfs_fsblock_t		firstfsb;
-	xfs_bmap_free_t		free_list;
-	xfs_bmbt_irec_t		imap;
-	xfs_off_t		ioffset;
-	xfs_off_t		iendoffset;
-	xfs_extlen_t		mod=0;
-	xfs_mount_t		*mp;
-	int			nimap;
-	uint			resblks;
-	xfs_off_t		rounding;
-	int			rt;
+	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		startoffset_fsb;
-	xfs_trans_t		*tp;
-
-	mp = ip->i_mount;
+	xfs_fileoff_t		endoffset_fsb;
+	int			done = 0, error;

 	trace_xfs_free_file_space(ip);

@ -1215,135 +1238,45 @@ xfs_free_file_space(
 	if (error)
 		return error;

-	error = 0;
 	if (len <= 0)	/* if nothing being freed */
+		return 0;
+
+	error = xfs_flush_unmap_range(ip, offset, len);
+	if (error)
 		return error;
-	rt = XFS_IS_REALTIME_INODE(ip);
-	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
+
+	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
 	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);

-	/* wait for the completion of any pending DIOs */
-	inode_dio_wait(VFS_I(ip));
-
-	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
-	ioffset = round_down(offset, rounding);
-	iendoffset = round_up(offset + len, rounding) - 1;
-	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
-					     iendoffset);
-	if (error)
-		goto out;
-	truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
-
 	/*
-	 * Need to zero the stuff we're not freeing, on disk.
-	 * If it's a realtime file & can't use unwritten extents then we
-	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
-	 * will take care of it for us.
+	 * Need to zero the stuff we're not freeing, on disk.  If it's a RT file
+	 * and we can't use unwritten extents then we actually need to ensure
+	 * to zero the whole extent, otherwise we just need to take care of
+	 * block boundaries, and xfs_bunmapi will handle the rest.
 	 */
-	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-		nimap = 1;
-		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
-					&imap, &nimap, 0);
+	if (XFS_IS_REALTIME_INODE(ip) &&
+	    !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+		error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
+				&endoffset_fsb);
 		if (error)
-			goto out;
-		ASSERT(nimap == 0 || nimap == 1);
-		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-			xfs_daddr_t	block;
-
-			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-			block = imap.br_startblock;
-			mod = do_div(block, mp->m_sb.sb_rextsize);
-			if (mod)
-				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
-		}
-		nimap = 1;
-		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
-					&imap, &nimap, 0);
-		if (error)
-			goto out;
-		ASSERT(nimap == 0 || nimap == 1);
-		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-			mod++;
-			if (mod && (mod != mp->m_sb.sb_rextsize))
-				endoffset_fsb -= mod;
-		}
+			return error;
 	}
-	if ((done = (endoffset_fsb <= startoffset_fsb)))
-		/*
-		 * One contiguous piece to clear
-		 */
-		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
-	else {
-		/*
-		 * Some full blocks, possibly two pieces to clear
-		 */
-		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
-			error = xfs_zero_remaining_bytes(ip, offset,
-				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
-		if (!error &&
-		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
-			error = xfs_zero_remaining_bytes(ip,
-				XFS_FSB_TO_B(mp, endoffset_fsb),
-				offset + len - 1);
+
+	if (endoffset_fsb > startoffset_fsb) {
+		while (!done) {
+			error = xfs_unmap_extent(ip, startoffset_fsb,
+					endoffset_fsb - startoffset_fsb, &done);
+			if (error)
+				return error;
+		}
 	}

 	/*
-	 * free file space until done or until there is an error
+	 * Now that we've unmapped all full blocks we'll have to zero out any
+	 * partial block at the beginning and/or end.  xfs_zero_range is
+	 * smart enough to skip any holes, including those we just created.
 	 */
-	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
-	while (!error && !done) {
-
-		/*
-		 * allocate and setup the transaction. Allow this
-		 * transaction to dip into the reserve blocks to ensure
-		 * the freeing of the space succeeds at ENOSPC.
-		 */
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
-				&tp);
-		if (error) {
-			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
-			break;
-		}
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = xfs_trans_reserve_quota(tp, mp,
-				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
-				resblks, 0, XFS_QMOPT_RES_REGBLKS);
-		if (error)
-			goto error1;
-
-		xfs_trans_ijoin(tp, ip, 0);
-
-		/*
-		 * issue the bunmapi() call to free the blocks
-		 */
-		xfs_bmap_init(&free_list, &firstfsb);
-		error = xfs_bunmapi(tp, ip, startoffset_fsb,
-				  endoffset_fsb - startoffset_fsb,
-				  0, 2, &firstfsb, &free_list, &done);
-		if (error)
-			goto error0;
-
-		/*
-		 * complete the transaction
-		 */
-		error = xfs_bmap_finish(&tp, &free_list, NULL);
-		if (error)
-			goto error0;
-
-		error = xfs_trans_commit(tp);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	}
-
- out:
-	return error;
-
- error0:
-	xfs_bmap_cancel(&free_list);
- error1:
-	xfs_trans_cancel(tp);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	goto out;
+	return xfs_zero_range(ip, offset, len, NULL);
 }

 /*
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@ -31,8 +31,6 @@ struct xfs_bmalloca;
 int	xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
 int	xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
 		     int whichfork, int *eof);
-int	xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
-			      int whichfork, int *count);
 int	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
 		xfs_fileoff_t start_fsb, xfs_fileoff_t length);

@ -43,7 +41,6 @@ int	xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,

 /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
 void	xfs_bmap_del_free(struct xfs_bmap_free *flist,
-			  struct xfs_bmap_free_item *prev,
 			  struct xfs_bmap_free_item *free);
 int	xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
 			       struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@ -79,6 +79,47 @@ xfs_buf_vmap_len(
 	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
 }

+/*
+ * Bump the I/O in flight count on the buftarg if we haven't yet done so for
+ * this buffer. The count is incremented once per buffer (per hold cycle)
+ * because the corresponding decrement is deferred to buffer release. Buffers
+ * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
+ * tracking adds unnecessary overhead. This is used for synchronization
+ * purposes with unmount (see xfs_wait_buftarg()), so all we really need is a
+ * count of in-flight buffers.
+ *
+ * Buffers that are never released (e.g., superblock, iclog buffers) must set
+ * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
+ * never reaches zero and unmount hangs indefinitely.
+ */
+static inline void
+xfs_buf_ioacct_inc(
+	struct xfs_buf	*bp)
+{
+	if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
+		return;
+
+	ASSERT(bp->b_flags & XBF_ASYNC);
+	bp->b_flags |= _XBF_IN_FLIGHT;
+	percpu_counter_inc(&bp->b_target->bt_io_count);
+}
+
+/*
+ * Clear the in-flight state on a buffer about to be released to the LRU or
+ * freed and unaccount from the buftarg.
+ */
+static inline void
+xfs_buf_ioacct_dec(
+	struct xfs_buf	*bp)
+{
+	if (!(bp->b_flags & _XBF_IN_FLIGHT))
+		return;
+
+	ASSERT(bp->b_flags & XBF_ASYNC);
+	bp->b_flags &= ~_XBF_IN_FLIGHT;
+	percpu_counter_dec(&bp->b_target->bt_io_count);
+}
+
 /*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
@ -102,6 +143,14 @@ xfs_buf_stale(
 	 */
 	bp->b_flags &= ~_XBF_DELWRI_Q;

+	/*
+	 * Once the buffer is marked stale and unlocked, a subsequent lookup
+	 * could reset b_flags. There is no guarantee that the buffer is
+	 * unaccounted (released to LRU) before that occurs. Drop in-flight
+	 * status now to preserve accounting consistency.
+	 */
+	xfs_buf_ioacct_dec(bp);
+
 	spin_lock(&bp->b_lock);
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@ -815,7 +864,8 @@ xfs_buf_get_uncached(
 	struct xfs_buf		*bp;
 	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

-	bp = _xfs_buf_alloc(target, &map, 1, 0);
+	/* flags might contain irrelevant bits, pass only what we care about */
+	bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
 	if (unlikely(bp == NULL))
 		goto fail;

@ -866,63 +916,85 @@ xfs_buf_hold(
 }

 /*
- *	Releases a hold on the specified buffer.  If the
- *	the hold count is 1, calls xfs_buf_free.
+ * Release a hold on the specified buffer. If the hold count is 1, the buffer is
+ * placed on LRU or freed (depending on b_lru_ref).
 */
 void
 xfs_buf_rele(
 	xfs_buf_t		*bp)
 {
 	struct xfs_perag	*pag = bp->b_pag;
+	bool			release;
+	bool			freebuf = false;

 	trace_xfs_buf_rele(bp, _RET_IP_);

 	if (!pag) {
 		ASSERT(list_empty(&bp->b_lru));
 		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
-		if (atomic_dec_and_test(&bp->b_hold))
+		if (atomic_dec_and_test(&bp->b_hold)) {
+			xfs_buf_ioacct_dec(bp);
 			xfs_buf_free(bp);
+		}
 		return;
 	}

 	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));

 	ASSERT(atomic_read(&bp->b_hold) > 0);
-	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-		spin_lock(&bp->b_lock);
-		if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
-			/*
-			 * If the buffer is added to the LRU take a new
-			 * reference to the buffer for the LRU and clear the
-			 * (now stale) dispose list state flag
-			 */
-			if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
-				bp->b_state &= ~XFS_BSTATE_DISPOSE;
-				atomic_inc(&bp->b_hold);
-			}
-			spin_unlock(&bp->b_lock);
-			spin_unlock(&pag->pag_buf_lock);
-		} else {
-			/*
-			 * most of the time buffers will already be removed from
-			 * the LRU, so optimise that case by checking for the
-			 * XFS_BSTATE_DISPOSE flag indicating the last list the
-			 * buffer was on was the disposal list
-			 */
-			if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-				list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
-			} else {
-				ASSERT(list_empty(&bp->b_lru));
-			}
-			spin_unlock(&bp->b_lock);

-			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
-			spin_unlock(&pag->pag_buf_lock);
-			xfs_perag_put(pag);
-			xfs_buf_free(bp);
-		}
+	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+	spin_lock(&bp->b_lock);
+	if (!release) {
+		/*
+		 * Drop the in-flight state if the buffer is already on the LRU
+		 * and it holds the only reference. This is racy because we
+		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
+		 * ensures the decrement occurs only once per-buf.
+		 */
+		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+			xfs_buf_ioacct_dec(bp);
+		goto out_unlock;
 	}
+
+	/* the last reference has been dropped ... */
+	xfs_buf_ioacct_dec(bp);
+	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+		/*
+		 * If the buffer is added to the LRU take a new reference to the
+		 * buffer for the LRU and clear the (now stale) dispose list
+		 * state flag
+		 */
+		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+			bp->b_state &= ~XFS_BSTATE_DISPOSE;
+			atomic_inc(&bp->b_hold);
+		}
+		spin_unlock(&pag->pag_buf_lock);
+	} else {
+		/*
+		 * most of the time buffers will already be removed from the
+		 * LRU, so optimise that case by checking for the
+		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
+		 * was on was the disposal list
+		 */
+		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+		} else {
+			ASSERT(list_empty(&bp->b_lru));
+		}
+
+		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+		rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
+		freebuf = true;
+	}
+
+out_unlock:
+	spin_unlock(&bp->b_lock);
+
+	if (freebuf)
+		xfs_buf_free(bp);
 }


@ -944,10 +1016,12 @@ xfs_buf_trylock(
 	int			locked;

 	locked = down_trylock(&bp->b_sema) == 0;
-	if (locked)
+	if (locked) {
 		XB_SET_OWNER(bp);
-
-	trace_xfs_buf_trylock(bp, _RET_IP_);
+		trace_xfs_buf_trylock(bp, _RET_IP_);
+	} else {
+		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
+	}
 	return locked;
 }

@ -1339,6 +1413,7 @@ xfs_buf_submit(
 	 * xfs_buf_ioend too early.
 	 */
 	atomic_set(&bp->b_io_remaining, 1);
+	xfs_buf_ioacct_inc(bp);
 	_xfs_buf_ioapply(bp);

 	/*
@ -1524,13 +1599,19 @@ xfs_wait_buftarg(
 	int loop = 0;

 	/*
-	 * We need to flush the buffer workqueue to ensure that all IO
-	 * completion processing is 100% done. Just waiting on buffer locks is
-	 * not sufficient for async IO as the reference count held over IO is
-	 * not released until after the buffer lock is dropped. Hence we need to
-	 * ensure here that all reference counts have been dropped before we
-	 * start walking the LRU list.
+	 * First wait on the buftarg I/O count for all in-flight buffers to be
+	 * released. This is critical as new buffers do not make the LRU until
+	 * they are released.
+	 *
+	 * Next, flush the buffer workqueue to ensure all completion processing
+	 * has finished. Just waiting on buffer locks is not sufficient for
+	 * async IO as the reference count held over IO is not released until
+	 * after the buffer lock is dropped. Hence we need to ensure here that
+	 * all reference counts have been dropped before we start walking the
+	 * LRU list.
 	 */
+	while (percpu_counter_sum(&btp->bt_io_count))
+		delay(100);
 	drain_workqueue(btp->bt_mount->m_buf_workqueue);

 	/* loop until there is nothing left on the lru list. */
@ -1627,6 +1708,8 @@ xfs_free_buftarg(
 	struct xfs_buftarg	*btp)
 {
 	unregister_shrinker(&btp->bt_shrinker);
+	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+	percpu_counter_destroy(&btp->bt_io_count);
 	list_lru_destroy(&btp->bt_lru);

 	if (mp->m_flags & XFS_MOUNT_BARRIER)
@ -1691,6 +1774,9 @@ xfs_alloc_buftarg(
 	if (list_lru_init(&btp->bt_lru))
 		goto error;

+	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+		goto error;
+
 	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
 	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@ -1774,18 +1860,33 @@ xfs_buf_cmp(
 	return 0;
 }

+/*
+ * submit buffers for write.
+ *
+ * When we have a large buffer list, we do not want to hold all the buffers
+ * locked while we block on the request queue waiting for IO dispatch. To avoid
+ * this problem, we lock and submit buffers in groups of 50, thereby minimising
+ * the lock hold times for lists which may contain thousands of objects.
+ *
+ * To do this, we sort the buffer list before we walk the list to lock and
+ * submit buffers, and we plug and unplug around each group of buffers we
+ * submit.
+ */
 static int
-__xfs_buf_delwri_submit(
+xfs_buf_delwri_submit_buffers(
 	struct list_head	*buffer_list,
-	struct list_head	*io_list,
-	bool			wait)
+	struct list_head	*wait_list)
 {
-	struct blk_plug		plug;
 	struct xfs_buf		*bp, *n;
+	LIST_HEAD		(submit_list);
 	int			pinned = 0;
+	struct blk_plug		plug;

+	list_sort(NULL, buffer_list, xfs_buf_cmp);
+
+	blk_start_plug(&plug);
 	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
-		if (!wait) {
+		if (!wait_list) {
 			if (xfs_buf_ispinned(bp)) {
 				pinned++;
 				continue;
@ -1808,25 +1909,21 @@ __xfs_buf_delwri_submit(
 			continue;
 		}

-		list_move_tail(&bp->b_list, io_list);
 		trace_xfs_buf_delwri_split(bp, _RET_IP_);
-	}
-
-	list_sort(NULL, io_list, xfs_buf_cmp);
-
-	blk_start_plug(&plug);
-	list_for_each_entry_safe(bp, n, io_list, b_list) {
-		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
-		bp->b_flags |= XBF_WRITE | XBF_ASYNC;

 		/*
-		 * we do all Io submission async. This means if we need to wait
-		 * for IO completion we need to take an extra reference so the
-		 * buffer is still valid on the other side.
+		 * We do all IO submission async. This means if we need
+		 * to wait for IO completion we need to take an extra
+		 * reference so the buffer is still valid on the other
+		 * side. We need to move the buffer onto the wait_list
+		 * at this point so the caller can still access it.
 		 */
-		if (wait)
+		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
+		if (wait_list) {
 			xfs_buf_hold(bp);
-		else
+			list_move_tail(&bp->b_list, wait_list);
+		} else
 			list_del_init(&bp->b_list);

 		xfs_buf_submit(bp);
@ -1849,8 +1946,7 @@ int
 xfs_buf_delwri_submit_nowait(
 	struct list_head	*buffer_list)
 {
-	LIST_HEAD		(io_list);
-	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
 }

 /*
@ -1865,15 +1961,15 @@ int
 xfs_buf_delwri_submit(
 	struct list_head	*buffer_list)
 {
-	LIST_HEAD		(io_list);
+	LIST_HEAD		(wait_list);
 	int			error = 0, error2;
 	struct xfs_buf		*bp;

-	__xfs_buf_delwri_submit(buffer_list, &io_list, true);
+	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);

 	/* Wait for IO to complete. */
-	while (!list_empty(&io_list)) {
-		bp = list_first_entry(&io_list, struct xfs_buf, b_list);
+	while (!list_empty(&wait_list)) {
+		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

 		list_del_init(&bp->b_list);

--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@ -43,6 +43,7 @@ typedef enum {
 #define XBF_READ	 (1 << 0) /* buffer intended for reading from device */
 #define XBF_WRITE	 (1 << 1) /* buffer intended for writing to device */
 #define XBF_READ_AHEAD	 (1 << 2) /* asynchronous read-ahead */
+#define XBF_NO_IOACCT	 (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
 #define XBF_ASYNC	 (1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE	 (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
@ -62,6 +63,7 @@ typedef enum {
 #define _XBF_KMEM	 (1 << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1 << 22)/* buffer on a delwri queue */
 #define _XBF_COMPOUND	 (1 << 23)/* compound buffer */
+#define _XBF_IN_FLIGHT	 (1 << 25) /* I/O in flight, for accounting purposes */

 typedef unsigned int xfs_buf_flags_t;

@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_COMPOUND,	"COMPOUND" }
+	{ _XBF_COMPOUND,	"COMPOUND" }, \
+	{ _XBF_IN_FLIGHT,	"IN_FLIGHT" }


 /*
@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
 	/* LRU control structures */
 	struct shrinker		bt_shrinker;
 	struct list_lru		bt_lru;
+
+	struct percpu_counter	bt_io_count;
 } xfs_buftarg_t;

 struct xfs_buf;
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@ -359,7 +359,7 @@ xfs_buf_item_format(
 	for (i = 0; i < bip->bli_format_count; i++) {
 		xfs_buf_item_format_segment(bip, lv, &vecp, offset,
 					    &bip->bli_formats[i]);
-		offset += bp->b_maps[i].bm_len;
+		offset += BBTOB(bp->b_maps[i].bm_len);
 	}

 	/*
@ -915,20 +915,28 @@ xfs_buf_item_log(
 	for (i = 0; i < bip->bli_format_count; i++) {
 		if (start > last)
 			break;
-		end = start + BBTOB(bp->b_maps[i].bm_len);
+		end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
+
+		/* skip to the map that includes the first byte to log */
 		if (first > end) {
 			start += BBTOB(bp->b_maps[i].bm_len);
 			continue;
 		}
+
+		/*
+		 * Trim the range to this segment and mark it in the bitmap.
+		 * Note that we must convert buffer offsets to segment relative
+		 * offsets (e.g., the first byte of each segment is byte 0 of
+		 * that segment).
+		 */
 		if (first < start)
 			first = start;
 		if (end > last)
 			end = last;
-
-		xfs_buf_item_log_segment(first, end,
+		xfs_buf_item_log_segment(first - start, end - start,
 					 &bip->bli_formats[i].blf_data_map[0]);

-		start += bp->b_maps[i].bm_len;
+		start += BBTOB(bp->b_maps[i].bm_len);
 	}
 }

@ -949,6 +957,7 @@ xfs_buf_item_free(
 	xfs_buf_log_item_t	*bip)
 {
 	xfs_buf_item_free_format(bip);
+	kmem_free(bip->bli_item.li_lv_shadow);
 	kmem_zone_free(xfs_buf_item_zone, bip);
 }

@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error(
 	trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
 	ASSERT(bp->b_iodone != NULL);

+	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
+
 	/*
 	 * If the write was asynchronous then no one will be looking for the
 	 * error.  If this is the first failure of this type, clear the error
@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error(
 	 * async write failure at least once, but we also need to set the buffer
 	 * up to behave correctly now for repeated failures.
 	 */
-	if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
+	if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
 	     bp->b_last_error != bp->b_error) {
-		bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
-			        XBF_DONE | XBF_WRITE_FAIL);
+		bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
 		bp->b_last_error = bp->b_error;
-		bp->b_retries = 0;
-		bp->b_first_retry_time = jiffies;
+		if (cfg->retry_timeout && !bp->b_first_retry_time)
+			bp->b_first_retry_time = jiffies;

 		xfs_buf_ioerror(bp, 0);
 		xfs_buf_submit(bp);
@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error(
 	 * Repeated failure on an async write. Take action according to the
 	 * error configuration we have been set up to use.
 	 */
-	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);

 	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
 	    ++bp->b_retries > cfg->max_retries)
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
 {
 	ASSERT(list_empty(&dqp->q_lru));

+	kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
 	mutex_destroy(&dqp->q_qlock);

 	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
 	spin_lock(&ailp->xa_lock);
 	xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);

+	kmem_free(qfs->qql_item.li_lv_shadow);
+	kmem_free(lip->li_lv_shadow);
 	kmem_free(qfs);
 	kmem_free(qfe);
 	return (xfs_lsn_t)-1;
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
 }

 int
-xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
 {
 	int i;
 	int len;
 	int64_t fsid;

+	if (error_tag >= XFS_ERRTAG_MAX)
+		return -EINVAL;
+
 	memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));

 	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
 	 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
 			(rf))))

-extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp);
+extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
 extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
 #else
 #define XFS_TEST_ERROR(expr, mp, tag, rf)	(expr)
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@ -40,6 +40,7 @@ void
 xfs_efi_item_free(
 	struct xfs_efi_log_item	*efip)
 {
+	kmem_free(efip->efi_item.li_lv_shadow);
 	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
 		kmem_free(efip);
 	else
@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 STATIC void
 xfs_efd_item_free(struct xfs_efd_log_item *efdp)
 {
+	kmem_free(efdp->efd_item.li_lv_shadow);
 	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
 		kmem_free(efdp);
 	else
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@ -37,6 +37,7 @@
 #include "xfs_log.h"
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"

 #include <linux/dcache.h>
 #include <linux/falloc.h>
@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
 }

 /*
- * xfs_iozero clears the specified range supplied via the page cache (except in
- * the DAX case). Writes through the page cache will allocate blocks over holes,
- * though the callers usually map the holes first and avoid them. If a block is
- * not completely zeroed, then it will be read from disk before being partially
- * zeroed.
- *
- * In the DAX case, we can just directly write to the underlying pages. This
- * will not allocate blocks, but will avoid holes and unwritten extents and so
- * not do unnecessary work.
+ * Clear the specified ranges to zero through either the pagecache or DAX.
+ * Holes and unwritten extents will be left as-is as they already are zeroed.
 */
 int
-xfs_iozero(
-	struct xfs_inode	*ip,	/* inode			*/
-	loff_t			pos,	/* offset in file		*/
-	size_t			count)	/* size of data to zero		*/
+xfs_zero_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		pos,
+	xfs_off_t		count,
+	bool			*did_zero)
 {
-	struct page		*page;
-	struct address_space	*mapping;
-	int			status = 0;
-
-
-	mapping = VFS_I(ip)->i_mapping;
-	do {
-		unsigned offset, bytes;
-		void *fsdata;
-
-		offset = (pos & (PAGE_SIZE -1)); /* Within page */
-		bytes = PAGE_SIZE - offset;
-		if (bytes > count)
-			bytes = count;
-
-		if (IS_DAX(VFS_I(ip))) {
-			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
-						     xfs_get_blocks_direct);
-			if (status)
-				break;
-		} else {
-			status = pagecache_write_begin(NULL, mapping, pos, bytes,
-						AOP_FLAG_UNINTERRUPTIBLE,
-						&page, &fsdata);
-			if (status)
-				break;
-
-			zero_user(page, offset, bytes);
-
-			status = pagecache_write_end(NULL, mapping, pos, bytes,
-						bytes, page, fsdata);
-			WARN_ON(status <= 0); /* can't return less than zero! */
-			status = 0;
-		}
-		pos += bytes;
-		count -= bytes;
-	} while (count);
-
-	return status;
+	return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
 }

 int
@ -282,49 +239,36 @@ xfs_file_fsync(
 }

 STATIC ssize_t
-xfs_file_read_iter(
+xfs_file_dio_aio_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
-	struct file		*file = iocb->ki_filp;
-	struct inode		*inode = file->f_mapping->host;
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	size_t			size = iov_iter_count(to);
+	loff_t			isize = i_size_read(inode);
+	size_t			count = iov_iter_count(to);
+	struct iov_iter		data;
+	struct xfs_buftarg	*target;
 	ssize_t			ret = 0;
-	int			ioflags = 0;
-	xfs_fsize_t		n;
-	loff_t			pos = iocb->ki_pos;

-	XFS_STATS_INC(mp, xs_read_calls);
+	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

-	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
-		ioflags |= XFS_IO_ISDIRECT;
-	if (file->f_mode & FMODE_NOCMTIME)
-		ioflags |= XFS_IO_INVIS;
+	if (!count)
+		return 0; /* skip atime */

-	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
-		xfs_buftarg_t	*target =
-			XFS_IS_REALTIME_INODE(ip) ?
-				mp->m_rtdev_targp : mp->m_ddev_targp;
-		/* DIO must be aligned to device logical sector size */
-		if ((pos | size) & target->bt_logical_sectormask) {
-			if (pos == i_size_read(inode))
-				return 0;
-			return -EINVAL;
-		}
+	if (XFS_IS_REALTIME_INODE(ip))
+		target = ip->i_mount->m_rtdev_targp;
+	else
+		target = ip->i_mount->m_ddev_targp;
+
+	/* DIO must be aligned to device logical sector size */
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
+		if (iocb->ki_pos == isize)
+			return 0;
+		return -EINVAL;
 	}

-	n = mp->m_super->s_maxbytes - pos;
-	if (n <= 0 || size == 0)
-		return 0;
-
-	if (n < size)
-		size = n;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
 	/*
 	 * Locking is a bit tricky here. If we take an exclusive lock for direct
 	 * IO, we effectively serialise all new concurrent read IO to this file
@ -336,7 +280,7 @@ xfs_file_read_iter(
 	 * serialisation.
 	 */
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+	if (mapping->nrpages) {
 		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

@ -351,8 +295,8 @@ xfs_file_read_iter(
 		 * flush and reduce the chances of repeated iolock cycles going
 		 * forward.
 		 */
-		if (inode->i_mapping->nrpages) {
-			ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+		if (mapping->nrpages) {
+			ret = filemap_write_and_wait(mapping);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
@ -363,20 +307,95 @@ xfs_file_read_iter(
 			 * we fail to invalidate a page, but this should never
 			 * happen on XFS. Warn if it does fail.
 			 */
-			ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+			ret = invalidate_inode_pages2(mapping);
 			WARN_ON_ONCE(ret);
 			ret = 0;
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}

-	trace_xfs_file_read(ip, size, pos, ioflags);
+	data = *to;
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, NULL, NULL, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(to, ret);
+	}
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);

+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_read(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct iov_iter		data = *to;
+	size_t			count = iov_iter_count(to);
+	ssize_t			ret = 0;
+
+	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
+
+	if (!count)
+		return 0; /* skip atime */
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(to, ret);
+	}
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	file_accessed(iocb->ki_filp);
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_read(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+	ssize_t			ret;
+
+	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
 	ret = generic_file_read_iter(iocb, to);
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
+	ssize_t			ret = 0;
+
+	XFS_STATS_INC(mp, xs_read_calls);
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_read(iocb, to);
+	else if (iocb->ki_flags & IOCB_DIRECT)
+		ret = xfs_file_dio_aio_read(iocb, to);
+	else
+		ret = xfs_file_buffered_aio_read(iocb, to);
+
 	if (ret > 0)
 		XFS_STATS_ADD(mp, xs_read_bytes, ret);
-
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 	return ret;
 }

@ -389,18 +408,14 @@ xfs_file_splice_read(
 	unsigned int		flags)
 {
 	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
-	int			ioflags = 0;
 	ssize_t			ret;

 	XFS_STATS_INC(ip->i_mount, xs_read_calls);

-	if (infilp->f_mode & FMODE_NOCMTIME)
-		ioflags |= XFS_IO_INVIS;
-
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;

-	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+	trace_xfs_file_splice_read(ip, count, *ppos);

 	/*
 	 * DAX inodes cannot ues the page cache for splice, so we have to push
@ -423,49 +438,6 @@ out:
 	return ret;
 }

-/*
- * This routine is called to handle zeroing any space in the last block of the
- * file that is beyond the EOF.  We do this since the size is being increased
- * without writing anything to that block and we don't want to read the
- * garbage on the disk.
- */
-STATIC int				/* error (positive) */
-xfs_zero_last_block(
-	struct xfs_inode	*ip,
-	xfs_fsize_t		offset,
-	xfs_fsize_t		isize,
-	bool			*did_zeroing)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		last_fsb = XFS_B_TO_FSBT(mp, isize);
-	int			zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-	int			zero_len;
-	int			nimaps = 1;
-	int			error = 0;
-	struct xfs_bmbt_irec	imap;
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	if (error)
-		return error;
-
-	ASSERT(nimaps > 0);
-
-	/*
-	 * If the block underlying isize is just a hole, then there
-	 * is nothing to zero.
-	 */
-	if (imap.br_startblock == HOLESTARTBLOCK)
-		return 0;
-
-	zero_len = mp->m_sb.sb_blocksize - zero_offset;
-	if (isize + zero_len > offset)
-		zero_len = offset - isize;
-	*did_zeroing = true;
-	return xfs_iozero(ip, isize, zero_len);
-}
-
 /*
 * Zero any on disk space between the current EOF and the new, larger EOF.
 *
@ -484,94 +456,11 @@ xfs_zero_eof(
 	xfs_fsize_t		isize,		/* current inode size */
 	bool			*did_zeroing)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		start_zero_fsb;
-	xfs_fileoff_t		end_zero_fsb;
-	xfs_fileoff_t		zero_count_fsb;
-	xfs_fileoff_t		last_fsb;
-	xfs_fileoff_t		zero_off;
-	xfs_fsize_t		zero_len;
-	int			nimaps;
-	int			error = 0;
-	struct xfs_bmbt_irec	imap;
-
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(offset > isize);

 	trace_xfs_zero_eof(ip, isize, offset - isize);
-
-	/*
-	 * First handle zeroing the block on which isize resides.
-	 *
-	 * We only zero a part of that block so it is handled specially.
-	 */
-	if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
-		error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
-		if (error)
-			return error;
-	}
-
-	/*
-	 * Calculate the range between the new size and the old where blocks
-	 * needing to be zeroed may exist.
-	 *
-	 * To get the block where the last byte in the file currently resides,
-	 * we need to subtract one from the size and truncate back to a block
-	 * boundary.  We subtract 1 in case the size is exactly on a block
-	 * boundary.
-	 */
-	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-	if (last_fsb == end_zero_fsb) {
-		/*
-		 * The size was only incremented on its last block.
-		 * We took care of that above, so just return.
-		 */
-		return 0;
-	}
-
-	ASSERT(start_zero_fsb <= end_zero_fsb);
-	while (start_zero_fsb <= end_zero_fsb) {
-		nimaps = 1;
-		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
-					  &imap, &nimaps, 0);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		if (error)
-			return error;
-
-		ASSERT(nimaps > 0);
-
-		if (imap.br_state == XFS_EXT_UNWRITTEN ||
-		    imap.br_startblock == HOLESTARTBLOCK) {
-			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-			continue;
-		}
-
-		/*
-		 * There are blocks we need to zero.
-		 */
-		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
-		if ((zero_off + zero_len) > offset)
-			zero_len = offset - zero_off;
-
-		error = xfs_iozero(ip, zero_off, zero_len);
-		if (error)
-			return error;
-
-		*did_zeroing = true;
-		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-	}
-
-	return 0;
+	return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
 }

 /*
@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
 					mp->m_rtdev_targp : mp->m_ddev_targp;

 	/* DIO must be aligned to device logical sector size */
-	if (!IS_DAX(inode) &&
-	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
+	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
 		return -EINVAL;

 	/* "unaligned" here means not aligned to a filesystem block */
@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
 	end = iocb->ki_pos + count - 1;

 	/*
-	 * See xfs_file_read_iter() for why we do a full-file flush here.
+	 * See xfs_file_dio_aio_read() for why we do a full-file flush here.
 	 */
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
 		iolock = XFS_IOLOCK_SHARED;
 	}

-	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);

 	data = *from;
-	ret = mapping->a_ops->direct_IO(iocb, &data);
+	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
+			xfs_get_blocks_direct, xfs_end_io_direct_write,
+			NULL, DIO_ASYNC_EXTEND);

 	/* see generic_file_direct_write() for why this is necessary */
 	if (mapping->nrpages) {
@ -809,10 +699,70 @@ out:
 	xfs_rw_iunlock(ip, iolock);

 	/*
-	 * No fallback to buffered IO on errors for XFS. DAX can result in
-	 * partial writes, but direct IO will either complete fully or fail.
+	 * No fallback to buffered IO on errors for XFS, direct IO will either
+	 * complete fully or fail.
 	 */
-	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
+static noinline ssize_t
+xfs_file_dax_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct address_space	*mapping = iocb->ki_filp->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	int			unaligned_io = 0;
+	int			iolock;
+	struct iov_iter		data;
+
+	/* "unaligned" here means not aligned to a filesystem block */
+	if ((iocb->ki_pos & mp->m_blockmask) ||
+	    ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
+		unaligned_io = 1;
+		iolock = XFS_IOLOCK_EXCL;
+	} else if (mapping->nrpages) {
+		iolock = XFS_IOLOCK_EXCL;
+	} else {
+		iolock = XFS_IOLOCK_SHARED;
+	}
+	xfs_rw_ilock(ip, iolock);
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2(mapping);
+		WARN_ON_ONCE(ret);
+	}
+
+	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
+
+	data = *from;
+	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
+			xfs_end_io_direct_write, 0);
+	if (ret > 0) {
+		iocb->ki_pos += ret;
+		iov_iter_advance(from, ret);
+	}
+out:
+	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }

@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
 	current->backing_dev_info = inode_to_bdi(inode);

 write_retry:
-	trace_xfs_file_buffered_write(ip, iov_iter_count(from),
-				      iocb->ki_pos, 0);
-	ret = generic_perform_write(file, from, iocb->ki_pos);
+	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
 	if (likely(ret >= 0))
 		iocb->ki_pos += ret;

@ -895,7 +844,9 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;

-	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
+	if (IS_DAX(inode))
+		ret = xfs_file_dax_write(iocb, from);
+	else if (iocb->ki_flags & IOCB_DIRECT)
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);
@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
 	if (IS_DAX(inode)) {
 		ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
 	} else {
-		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
 		ret = block_page_mkwrite_return(ret);
 	}

--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@ -667,8 +667,11 @@ xfs_reserve_blocks(
 	__uint64_t              *inval,
 	xfs_fsop_resblks_t      *outval)
 {
-	__int64_t		lcounter, delta, fdblks_delta;
+	__int64_t		lcounter, delta;
+	__int64_t		fdblks_delta = 0;
 	__uint64_t		request;
+	__int64_t		free;
+	int			error = 0;

 	/* If inval is null, report current values and return */
 	if (inval == (__uint64_t *)NULL) {
@ -682,24 +685,23 @@ xfs_reserve_blocks(
 	request = *inval;

 	/*
-	 * With per-cpu counters, this becomes an interesting
-	 * problem. we needto work out if we are freeing or allocation
-	 * blocks first, then we can do the modification as necessary.
+	 * With per-cpu counters, this becomes an interesting problem. We need
+	 * to work out if we are freeing or allocating blocks first, then we can
+	 * do the modification as necessary.
 	 *
-	 * We do this under the m_sb_lock so that if we are near
-	 * ENOSPC, we will hold out any changes while we work out
-	 * what to do. This means that the amount of free space can
-	 * change while we do this, so we need to retry if we end up
-	 * trying to reserve more space than is available.
+	 * We do this under the m_sb_lock so that if we are near ENOSPC, we will
+	 * hold out any changes while we work out what to do. This means that
+	 * the amount of free space can change while we do this, so we need to
+	 * retry if we end up trying to reserve more space than is available.
 	 */
-retry:
 	spin_lock(&mp->m_sb_lock);

 	/*
 	 * If our previous reservation was larger than the current value,
-	 * then move any unused blocks back to the free pool.
+	 * then move any unused blocks back to the free pool. Modify the resblks
+	 * counters directly since we shouldn't have any problems unreserving
+	 * space.
 	 */
-	fdblks_delta = 0;
 	if (mp->m_resblks > request) {
 		lcounter = mp->m_resblks_avail - request;
 		if (lcounter  > 0) {		/* release unused blocks */
@ -707,54 +709,67 @@ retry:
 			mp->m_resblks_avail -= lcounter;
 		}
 		mp->m_resblks = request;
-	} else {
-		__int64_t	free;
+		if (fdblks_delta) {
+			spin_unlock(&mp->m_sb_lock);
+			error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
+			spin_lock(&mp->m_sb_lock);
+		}

+		goto out;
+	}
+
+	/*
+	 * If the request is larger than the current reservation, reserve the
+	 * blocks before we update the reserve counters. Sample m_fdblocks and
+	 * perform a partial reservation if the request exceeds free space.
+	 */
+	error = -ENOSPC;
+	do {
 		free = percpu_counter_sum(&mp->m_fdblocks) -
 							XFS_ALLOC_SET_ASIDE(mp);
 		if (!free)
-			goto out; /* ENOSPC and fdblks_delta = 0 */
+			break;

 		delta = request - mp->m_resblks;
 		lcounter = free - delta;
-		if (lcounter < 0) {
+		if (lcounter < 0)
 			/* We can't satisfy the request, just get what we can */
-			mp->m_resblks += free;
-			mp->m_resblks_avail += free;
-			fdblks_delta = -free;
-		} else {
-			fdblks_delta = -delta;
-			mp->m_resblks = request;
-			mp->m_resblks_avail += delta;
-		}
+			fdblks_delta = free;
+		else
+			fdblks_delta = delta;
+
+		/*
+		 * We'll either succeed in getting space from the free block
+		 * count or we'll get an ENOSPC. If we get an ENOSPC, it means
+		 * things changed while we were calculating fdblks_delta and so
+		 * we should try again to see if there is anything left to
+		 * reserve.
+		 *
+		 * Don't set the reserved flag here - we don't want to reserve
+		 * the extra reserve blocks from the reserve.....
+		 */
+		spin_unlock(&mp->m_sb_lock);
+		error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+		spin_lock(&mp->m_sb_lock);
+	} while (error == -ENOSPC);
+
+	/*
+	 * Update the reserve counters if blocks have been successfully
+	 * allocated.
+	 */
+	if (!error && fdblks_delta) {
+		mp->m_resblks += fdblks_delta;
+		mp->m_resblks_avail += fdblks_delta;
 	}
+
 out:
 	if (outval) {
 		outval->resblks = mp->m_resblks;
 		outval->resblks_avail = mp->m_resblks_avail;
 	}
-	spin_unlock(&mp->m_sb_lock);

-	if (fdblks_delta) {
-		/*
-		 * If we are putting blocks back here, m_resblks_avail is
-		 * already at its max so this will put it in the free pool.
-		 *
-		 * If we need space, we'll either succeed in getting it
-		 * from the free block count or we'll get an enospc. If
-		 * we get a ENOSPC, it means things changed while we were
-		 * calculating fdblks_delta and so we should try again to
-		 * see if there is anything left to reserve.
-		 *
-		 * Don't set the reserved flag here - we don't want to reserve
-		 * the extra reserve blocks from the reserve.....
-		 */
-		int error;
-		error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
-		if (error == -ENOSPC)
-			goto retry;
-	}
-	return 0;
+	spin_unlock(&mp->m_sb_lock);
+	return error;
 }

 int
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@ -765,7 +765,7 @@ restart:
 * Background scanning to trim post-EOF preallocated space. This is queued
 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 */
-STATIC void
+void
 xfs_queue_eofblocks(
 	struct xfs_mount *mp)
 {
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
 int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
 void xfs_eofblocks_worker(struct work_struct *);
+void xfs_queue_eofblocks(struct xfs_mount *);

 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, int flags, void *args),
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
 * lock more than one at a time, lockdep will report false positives saying we
 * have violated locking orders.
 */
-void
+static void
 xfs_lock_inodes(
 	xfs_inode_t	**ips,
 	int		inodes,
@ -667,14 +667,6 @@ xfs_ip2xflags(
 	return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
 }

-uint
-xfs_dic2xflags(
-	struct xfs_dinode	*dip)
-{
-	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
-				be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
-}
-
 /*
 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match. If a CI match is found,
@ -748,7 +740,7 @@ out_unlock:
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
-int
+static int
 xfs_ialloc(
 	xfs_trans_t	*tp,
 	xfs_inode_t	*pip,
@ -1085,7 +1077,7 @@ xfs_dir_ialloc(
 * link count to go to zero, move the inode to AGI unlinked list so that it can
 * be freed when the last active reference goes away via xfs_inactive().
 */
-int				/* error */
+static int			/* error */
 xfs_droplink(
 	xfs_trans_t *tp,
 	xfs_inode_t *ip)
@ -1104,7 +1096,7 @@ xfs_droplink(
 /*
 * Increment the link count on an inode & log the change.
 */
-int
+static int
 xfs_bumplink(
 	xfs_trans_t *tp,
 	xfs_inode_t *ip)
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@ -395,12 +395,8 @@ void		xfs_ilock_demote(xfs_inode_t *, uint);
 int		xfs_isilocked(xfs_inode_t *, uint);
 uint		xfs_ilock_data_map_shared(struct xfs_inode *);
 uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
-int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
-			   xfs_nlink_t, xfs_dev_t, prid_t, int,
-			   struct xfs_buf **, xfs_inode_t **);

 uint		xfs_ip2xflags(struct xfs_inode *);
-uint		xfs_dic2xflags(struct xfs_dinode *);
 int		xfs_ifree(struct xfs_trans *, xfs_inode_t *,
 			   struct xfs_bmap_free *);
 int		xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
@ -411,7 +407,6 @@ void		xfs_iunpin_wait(xfs_inode_t *);
 #define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))

 int		xfs_iflush(struct xfs_inode *, struct xfs_buf **);
-void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);

 xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
@ -419,8 +414,6 @@ xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
 int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
 			       xfs_nlink_t, xfs_dev_t, prid_t, int,
 			       struct xfs_inode **, int *);
-int		xfs_droplink(struct xfs_trans *, struct xfs_inode *);
-int		xfs_bumplink(struct xfs_trans *, struct xfs_inode *);

 /* from xfs_file.c */
 enum xfs_prealloc_flags {
@ -434,7 +427,8 @@ int	xfs_update_prealloc_flags(struct xfs_inode *ip,
 				  enum xfs_prealloc_flags flags);
 int	xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
 		     xfs_fsize_t isize, bool *did_zeroing);
-int	xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+int	xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
+		bool *did_zero);
 loff_t	__xfs_seek_hole_data(struct inode *inode, loff_t start,
 			     loff_t eof, int whence);

@ -479,14 +473,4 @@ do { \

 extern struct kmem_zone	*xfs_inode_zone;

-/*
- * Flags for read/write calls
- */
-#define XFS_IO_ISDIRECT	0x00001		/* bypass page cache */
-#define XFS_IO_INVIS	0x00002		/* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
-	{ XFS_IO_ISDIRECT,	"DIRECT" }, \
-	{ XFS_IO_INVIS,		"INVIS"}
-
 #endif	/* __XFS_INODE_H__ */
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@ -651,6 +651,7 @@ void
 xfs_inode_item_destroy(
 	xfs_inode_t	*ip)
 {
+	kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
 	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
 }

--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@ -595,13 +595,12 @@ xfs_attrmulti_by_handle(

 int
 xfs_ioc_space(
-	struct xfs_inode	*ip,
-	struct inode		*inode,
 	struct file		*filp,
-	int			ioflags,
 	unsigned int		cmd,
 	xfs_flock64_t		*bf)
 {
+	struct inode		*inode = file_inode(filp);
+	struct xfs_inode	*ip = XFS_I(inode);
 	struct iattr		iattr;
 	enum xfs_prealloc_flags	flags = 0;
 	uint			iolock = XFS_IOLOCK_EXCL;
@ -626,7 +625,7 @@ xfs_ioc_space(

 	if (filp->f_flags & O_DSYNC)
 		flags |= XFS_PREALLOC_SYNC;
-	if (ioflags & XFS_IO_INVIS)
+	if (filp->f_mode & FMODE_NOCMTIME)
 		flags |= XFS_PREALLOC_INVISIBLE;

 	error = mnt_want_write_file(filp);
@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)

 STATIC int
 xfs_ioc_getbmap(
-	struct xfs_inode	*ip,
-	int			ioflags,
+	struct file		*file,
 	unsigned int		cmd,
 	void			__user *arg)
 {
@ -1479,10 +1477,10 @@ xfs_ioc_getbmap(
 		return -EINVAL;

 	bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
-	if (ioflags & XFS_IO_INVIS)
+	if (file->f_mode & FMODE_NOCMTIME)
 		bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;

-	error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+	error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
 			    (__force struct getbmap *)arg+1);
 	if (error)
 		return error;
@ -1575,6 +1573,11 @@ xfs_ioc_swapext(
 		goto out_put_tmp_file;
 	}

+	/*
+	 * We need to ensure that the fds passed in point to XFS inodes
+	 * before we cast and access them as XFS structures as we have no
+	 * control over what the user passes us here.
+	 */
 	if (f.file->f_op != &xfs_file_operations ||
 	    tmp.file->f_op != &xfs_file_operations) {
 		error = -EINVAL;
@ -1625,12 +1628,8 @@ xfs_file_ioctl(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	void			__user *arg = (void __user *)p;
-	int			ioflags = 0;
 	int			error;

-	if (filp->f_mode & FMODE_NOCMTIME)
-		ioflags |= XFS_IO_INVIS;
-
 	trace_xfs_file_ioctl(ip);

 	switch (cmd) {
@ -1649,7 +1648,7 @@ xfs_file_ioctl(

 		if (copy_from_user(&bf, arg, sizeof(bf)))
 			return -EFAULT;
-		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+		return xfs_ioc_space(filp, cmd, &bf);
 	}
 	case XFS_IOC_DIOINFO: {
 		struct dioattr	da;
@ -1708,7 +1707,7 @@ xfs_file_ioctl(

 	case XFS_IOC_GETBMAP:
 	case XFS_IOC_GETBMAPA:
-		return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+		return xfs_ioc_getbmap(filp, cmd, arg);

 	case XFS_IOC_GETBMAPX:
 		return xfs_ioc_getbmapx(ip, arg);
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@ -20,10 +20,7 @@

 extern int
 xfs_ioc_space(
-	struct xfs_inode	*ip,
-	struct inode		*inode,
 	struct file		*filp,
-	int			ioflags,
 	unsigned int		cmd,
 	xfs_flock64_t		*bf);

--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	void			__user *arg = (void __user *)p;
-	int			ioflags = 0;
 	int			error;

-	if (filp->f_mode & FMODE_NOCMTIME)
-		ioflags |= XFS_IO_INVIS;
-
 	trace_xfs_file_compat_ioctl(ip);

 	switch (cmd) {
@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
 		if (xfs_compat_flock64_copyin(&bf, arg))
 			return -EFAULT;
 		cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
-		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+		return xfs_ioc_space(filp, cmd, &bf);
 	}
 	case XFS_IOC_FSGEOMETRY_V1_32:
 		return xfs_compat_ioc_fsgeometry_v1(mp, arg);
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@ -15,6 +15,7 @@
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
+#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_shared.h"
@ -940,3 +941,173 @@ error_on_bmapi_transaction:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
+
+void
+xfs_bmbt_to_iomap(
+	struct xfs_inode	*ip,
+	struct iomap		*iomap,
+	struct xfs_bmbt_irec	*imap)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (imap->br_startblock == HOLESTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_HOLE;
+	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_DELALLOC;
+	} else {
+		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+		if (imap->br_state == XFS_EXT_UNWRITTEN)
+			iomap->type = IOMAP_UNWRITTEN;
+		else
+			iomap->type = IOMAP_MAPPED;
+	}
+	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+}
+
+static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
+{
+	return !nimaps ||
+		imap->br_startblock == HOLESTARTBLOCK ||
+		imap->br_startblock == DELAYSTARTBLOCK;
+}
+
+static int
+xfs_file_iomap_begin(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	unsigned		flags,
+	struct iomap		*iomap)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			nimaps = 1, error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	ASSERT(offset <= mp->m_super->s_maxbytes);
+	if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
+		length = mp->m_super->s_maxbytes - offset;
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+			       &nimaps, XFS_BMAPI_ENTIRE);
+	if (error) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return error;
+	}
+
+	if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
+		/*
+		 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+		 * pages to keep the chunks of work done here somewhat symmetric
+		 * with the work writeback does. This is a completely arbitrary
+		 * number pulled out of thin air as a best guess for initial
+		 * testing.
+		 *
+		 * Note that the value needs to be less than 32 bits wide until
+		 * the lower level functions are updated.
+		 */
+		length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+		if (xfs_get_extsz_hint(ip)) {
+			/*
+			 * xfs_iomap_write_direct() expects the shared lock. It
+			 * is unlocked on return.
+			 */
+			xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+			error = xfs_iomap_write_direct(ip, offset, length, &imap,
+					nimaps);
+		} else {
+			error = xfs_iomap_write_delay(ip, offset, length, &imap);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		}
+
+		if (error)
+			return error;
+
+		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+		xfs_bmbt_to_iomap(ip, iomap, &imap);
+	} else if (nimaps) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+		xfs_bmbt_to_iomap(ip, iomap, &imap);
+	} else {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->type = IOMAP_HOLE;
+		iomap->offset = offset;
+		iomap->length = length;
+	}
+
+	return 0;
+}
+
+static int
+xfs_file_iomap_end_delalloc(
+	struct xfs_inode	*ip,
+	loff_t			offset,
+	loff_t			length,
+	ssize_t			written)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error = 0;
+
+	start_fsb = XFS_B_TO_FSB(mp, offset + written);
+	end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+	/*
+	 * Trim back delalloc blocks if we didn't manage to write the whole
+	 * range reserved.
+	 *
+	 * We don't need to care about racing delalloc as we hold i_mutex
+	 * across the reserve/allocate/unreserve calls. If there are delalloc
+	 * blocks in the range, they are ours.
+	 */
+	if (start_fsb < end_fsb) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+					       end_fsb - start_fsb);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+		if (error && !XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_alert(mp, "%s: unable to clean up ino %lld",
+				__func__, ip->i_ino);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+static int
+xfs_file_iomap_end(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	ssize_t			written,
+	unsigned		flags,
+	struct iomap		*iomap)
+{
+	if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+		return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+				length, written);
+	return 0;
+}
+
+struct iomap_ops xfs_iomap_ops = {
+	.iomap_begin		= xfs_file_iomap_begin,
+	.iomap_end		= xfs_file_iomap_end,
+};
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@ -18,6 +18,8 @@
 #ifndef __XFS_IOMAP_H__
 #define __XFS_IOMAP_H__

+#include <linux/iomap.h>
+
 struct xfs_inode;
 struct xfs_bmbt_irec;

@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
 			struct xfs_bmbt_irec *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);

+void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+		struct xfs_bmbt_irec *);
+
+extern struct iomap_ops xfs_iomap_ops;
+
 #endif /* __XFS_IOMAP_H__*/
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@ -38,12 +38,13 @@
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
 #include "xfs_pnfs.h"
+#include "xfs_iomap.h"

 #include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/security.h>
-#include <linux/fiemap.h>
+#include <linux/iomap.h>
 #include <linux/slab.h>

 /*
@ -800,21 +801,31 @@ xfs_setattr_size(
 	if (error)
 		return error;

+	/*
+	 * Wait for all direct I/O to complete.
+	 */
+	inode_dio_wait(inode);
+
 	/*
 	 * File data changes must be complete before we start the transaction to
 	 * modify the inode.  This needs to be done before joining the inode to
 	 * the transaction because the inode cannot be unlocked once it is a
 	 * part of the transaction.
 	 *
-	 * Start with zeroing any data block beyond EOF that we may expose on
-	 * file extension.
+	 * Start with zeroing any data beyond EOF that we may expose on file
+	 * extension, or zeroing out the rest of the block on a downward
+	 * truncate.
 	 */
 	if (newsize > oldsize) {
 		error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
-		if (error)
-			return error;
+	} else {
+		error = iomap_truncate_page(inode, newsize, &did_zeroing,
+				&xfs_iomap_ops);
 	}

+	if (error)
+		return error;
+
 	/*
 	 * We are going to log the inode size change in this transaction so
 	 * any previous writes that are beyond the on disk EOF and the new
@ -823,17 +834,14 @@ xfs_setattr_size(
 	 * problem. Note that this includes any block zeroing we did above;
 	 * otherwise those blocks may not be zeroed after a crash.
 	 */
-	if (newsize > ip->i_d.di_size &&
-	    (oldsize != ip->i_d.di_size || did_zeroing)) {
+	if (did_zeroing ||
+	    (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
 		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
 						      ip->i_d.di_size, newsize);
 		if (error)
 			return error;
 	}

-	/* Now wait for all direct I/O to complete. */
-	inode_dio_wait(inode);
-
 	/*
 	 * We've already locked out new page faults, so now we can safely remove
 	 * pages from the page cache knowing they won't get refaulted until we
@ -851,13 +859,6 @@ xfs_setattr_size(
 	 * to hope that the caller sees ENOMEM and retries the truncate
 	 * operation.
 	 */
-	if (IS_DAX(inode))
-		error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
-	else
-		error = block_truncate_page(inode->i_mapping, newsize,
-					    xfs_get_blocks);
-	if (error)
-		return error;
 	truncate_setsize(inode, newsize);

 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
@ -998,51 +999,6 @@ xfs_vn_update_time(
 	return xfs_trans_commit(tp);
 }

-#define XFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-
-/*
- * Call fiemap helper to fill in user data.
- * Returns positive errors to xfs_getbmap.
- */
-STATIC int
-xfs_fiemap_format(
-	void			**arg,
-	struct getbmapx		*bmv,
-	int			*full)
-{
-	int			error;
-	struct fiemap_extent_info *fieinfo = *arg;
-	u32			fiemap_flags = 0;
-	u64			logical, physical, length;
-
-	/* Do nothing for a hole */
-	if (bmv->bmv_block == -1LL)
-		return 0;
-
-	logical = BBTOB(bmv->bmv_offset);
-	physical = BBTOB(bmv->bmv_block);
-	length = BBTOB(bmv->bmv_length);
-
-	if (bmv->bmv_oflags & BMV_OF_PREALLOC)
-		fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
-	else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
-		fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
-				 FIEMAP_EXTENT_UNKNOWN);
-		physical = 0;   /* no block yet */
-	}
-	if (bmv->bmv_oflags & BMV_OF_LAST)
-		fiemap_flags |= FIEMAP_EXTENT_LAST;
-
-	error = fiemap_fill_next_extent(fieinfo, logical, physical,
-					length, fiemap_flags);
-	if (error > 0) {
-		error = 0;
-		*full = 1;	/* user array now full */
-	}
-
-	return error;
-}
-
 STATIC int
 xfs_vn_fiemap(
 	struct inode		*inode,
@ -1050,38 +1006,13 @@ xfs_vn_fiemap(
 	u64			start,
 	u64			length)
 {
-	xfs_inode_t		*ip = XFS_I(inode);
-	struct getbmapx		bm;
 	int			error;

-	error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
-	if (error)
-		return error;
+	xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
+	error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+	xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);

-	/* Set up bmap header for xfs internal routine */
-	bm.bmv_offset = BTOBBT(start);
-	/* Special case for whole file */
-	if (length == FIEMAP_MAX_OFFSET)
-		bm.bmv_length = -1LL;
-	else
-		bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
-
-	/* We add one because in getbmap world count includes the header */
-	bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
-					fieinfo->fi_extents_max + 1;
-	bm.bmv_count = min_t(__s32, bm.bmv_count,
-			     (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-	bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
-	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
-		bm.bmv_iflags |= BMV_IF_ATTRFORK;
-	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
-		bm.bmv_iflags |= BMV_IF_DELALLOC;
-
-	error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
-	if (error)
-		return error;
-
-	return 0;
+	return error;
 }

 STATIC int
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
 	return x;
 }

-/* ARM old ABI has some weird alignment/padding */
-#if defined(__arm__) && !defined(__ARM_EABI__)
-#define __arch_pack __attribute__((packed))
-#else
-#define __arch_pack
-#endif
-
 #define ASSERT_ALWAYS(expr)	\
 	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))

--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@ -788,7 +788,7 @@ xfs_log_mount_cancel(
 * As far as I know, there weren't any dependencies on the old behaviour.
 */

-int
+static int
 xfs_log_unmount_write(xfs_mount_t *mp)
 {
 	struct xlog	 *log = mp->m_log;
@ -1036,7 +1036,7 @@ xfs_log_space_wake(
 * there's no point in running a dummy transaction at this point because we
 * can't start trying to idle the log until both the CIL and AIL are empty.
 */
-int
+static int
 xfs_log_need_covered(xfs_mount_t *mp)
 {
 	struct xlog	*log = mp->m_log;
@ -1177,7 +1177,7 @@ xlog_space_left(
 * The log manager needs its own routine, in order to control what
 * happens with the buffer after the write completes.
 */
-void
+static void
 xlog_iodone(xfs_buf_t *bp)
 {
 	struct xlog_in_core	*iclog = bp->b_fspriv;
@ -1302,7 +1302,7 @@ xfs_log_work_queue(
 * disk. If there is nothing dirty, then we might need to cover the log to
 * indicate that the filesystem is idle.
 */
-void
+static void
 xfs_log_worker(
 	struct work_struct	*work)
 {
@ -1415,7 +1415,7 @@ xlog_alloc_log(
 	 */
 	error = -ENOMEM;
 	bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
-			   BTOBB(log->l_iclog_size), 0);
+			   BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
 	if (!bp)
 		goto out_free_log;

@ -1454,7 +1454,8 @@ xlog_alloc_log(
 		prev_iclog = iclog;

 		bp = xfs_buf_get_uncached(mp->m_logdev_targp,
-						BTOBB(log->l_iclog_size), 0);
+					  BTOBB(log->l_iclog_size),
+					  XBF_NO_IOACCT);
 		if (!bp)
 			goto out_free_iclog;

--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@ -163,12 +163,8 @@ int	  xfs_log_reserve(struct xfs_mount *mp,
 			  __uint8_t	   clientid,
 			  bool		   permanent);
 int	  xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-int	  xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
 int	  xfs_log_force_umount(struct xfs_mount *mp, int logerror);
-int	  xfs_log_need_covered(struct xfs_mount *mp);
-
-void	  xlog_iodone(struct xfs_buf *);

 struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
@ -178,7 +174,6 @@ void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);

 void	xfs_log_work_queue(struct xfs_mount *mp);
-void	xfs_log_worker(struct work_struct *work);
 void	xfs_log_quiesce(struct xfs_mount *mp);
 bool	xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);

--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
 	log->l_cilp->xc_ctx->sequence = 1;
 }

+static inline int
+xlog_cil_iovec_space(
+	uint	niovecs)
+{
+	return round_up((sizeof(struct xfs_log_vec) +
+					niovecs * sizeof(struct xfs_log_iovec)),
+			sizeof(uint64_t));
+}
+
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ *	a) it does not exist; or
+ *	b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * it is needed, and this is the buffer that is reallocated to match the size of
+ * the incoming modification. Then during the formatting of the item we can swap
+ * the active buffer with the new one if we can't reuse the existing buffer. We
+ * don't free the old buffer as it may be reused on the next modification if
+ * its size is right, otherwise we'll free and reallocate it at that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+	struct xlog		*log,
+	struct xfs_trans	*tp)
+{
+	struct xfs_log_item_desc *lidp;
+
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		struct xfs_log_item *lip = lidp->lid_item;
+		struct xfs_log_vec *lv;
+		int	niovecs = 0;
+		int	nbytes = 0;
+		int	buf_size;
+		bool	ordered = false;
+
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+			continue;
+
+		/* get number of vecs and size of data to be stored */
+		lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+		/*
+		 * Ordered items need to be tracked but we do not wish to write
+		 * them. We need a logvec to track the object, but we do not
+		 * need an iovec or buffer to be allocated for copying data.
+		 */
+		if (niovecs == XFS_LOG_VEC_ORDERED) {
+			ordered = true;
+			niovecs = 0;
+			nbytes = 0;
+		}
+
+		/*
+		 * We 64-bit align the length of each iovec so that the start
+		 * of the next one is naturally aligned.  We'll need to
+		 * account for that slack space here. Then round nbytes up
+		 * to 64-bit alignment so that the initial buffer alignment is
+		 * easy to calculate and verify.
+		 */
+		nbytes += niovecs * sizeof(uint64_t);
+		nbytes = round_up(nbytes, sizeof(uint64_t));
+
+		/*
+		 * The data buffer needs to start 64-bit aligned, so round up
+		 * that space to ensure we can align it appropriately and not
+		 * overrun the buffer.
+		 */
+		buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+
+		/*
+		 * if we have no shadow buffer, or it is too small, we need to
+		 * reallocate it.
+		 */
+		if (!lip->li_lv_shadow ||
+		    buf_size > lip->li_lv_shadow->lv_size) {
+
+			/*
+			 * We free and allocate here as a realloc would copy
+			 * unnecessary data. We don't use kmem_zalloc() for the
+			 * same reason - we don't need to zero the data area in
+			 * the buffer, only the log vector header and the iovec
+			 * storage.
+			 */
+			kmem_free(lip->li_lv_shadow);
+
+			lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+			memset(lv, 0, xlog_cil_iovec_space(niovecs));
+
+			lv->lv_item = lip;
+			lv->lv_size = buf_size;
+			if (ordered)
+				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			else
+				lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+			lip->li_lv_shadow = lv;
+		} else {
+			/* same or smaller, optimise common overwrite case */
+			lv = lip->li_lv_shadow;
+			if (ordered)
+				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			else
+				lv->lv_buf_len = 0;
+			lv->lv_bytes = 0;
+			lv->lv_next = NULL;
+		}
+
+		/* Ensure the lv is set up according to ->iop_size */
+		lv->lv_niovecs = niovecs;
+
+		/* The allocated data region lies beyond the iovec region */
+		lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+	}
+
+}
+
 /*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
@ -100,16 +251,19 @@ xfs_cil_prepare_item(
 	/*
 	 * If there is no old LV, this is the first time we've seen the item in
 	 * this CIL context and so we need to pin it. If we are replacing the
-	 * old_lv, then remove the space it accounts for and free it.
+	 * old_lv, then remove the space it accounts for and make it the shadow
+	 * buffer for later freeing. In both cases we are now switching to the
+	 * shadow buffer, so update the pointer to it appropriately.
 	 */
-	if (!old_lv)
+	if (!old_lv) {
 		lv->lv_item->li_ops->iop_pin(lv->lv_item);
-	else if (old_lv != lv) {
+		lv->lv_item->li_lv_shadow = NULL;
+	} else if (old_lv != lv) {
 		ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);

 		*diff_len -= old_lv->lv_bytes;
 		*diff_iovecs -= old_lv->lv_niovecs;
-		kmem_free(old_lv);
+		lv->lv_item->li_lv_shadow = old_lv;
 	}

 	/* attach new log vector to log item */
@ -133,11 +287,13 @@ xfs_cil_prepare_item(
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
- * This function builds a vector for the changes in each log item in the
- * transaction. It then works out the length of the buffer needed for each log
- * item, allocates them and formats the vector for the item into the buffer.
- * The buffer is then attached to the log item are then inserted into the
- * Committed Item List for tracking until the next checkpoint is written out.
+ * This function takes the prepared log vectors attached to each log item, and
+ * formats the changes into the log vector buffer. The buffer it uses is
+ * dependent on the current state of the vector in the CIL - the shadow lv is
+ * guaranteed to be large enough for the current modification, but we will only
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simply swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by the next modification or the freeing of the log item.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
 	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		struct xfs_log_item *lip = lidp->lid_item;
 		struct xfs_log_vec *lv;
-		struct xfs_log_vec *old_lv;
-		int	niovecs = 0;
-		int	nbytes = 0;
-		int	buf_size;
+		struct xfs_log_vec *old_lv = NULL;
+		struct xfs_log_vec *shadow;
 		bool	ordered = false;

 		/* Skip items which aren't dirty in this transaction. */
 		if (!(lidp->lid_flags & XFS_LID_DIRTY))
 			continue;

-		/* get number of vecs and size of data to be stored */
-		lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+		/*
+		 * The formatting size information is already attached to
+		 * the shadow lv on the log item.
+		 */
+		shadow = lip->li_lv_shadow;
+		if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
+			ordered = true;

 		/* Skip items that do not have any vectors for writing */
-		if (!niovecs)
+		if (!shadow->lv_niovecs && !ordered)
 			continue;

-		/*
-		 * Ordered items need to be tracked but we do not wish to write
-		 * them. We need a logvec to track the object, but we do not
-		 * need an iovec or buffer to be allocated for copying data.
-		 */
-		if (niovecs == XFS_LOG_VEC_ORDERED) {
-			ordered = true;
-			niovecs = 0;
-			nbytes = 0;
-		}
-
-		/*
-		 * We 64-bit align the length of each iovec so that the start
-		 * of the next one is naturally aligned.  We'll need to
-		 * account for that slack space here. Then round nbytes up
-		 * to 64-bit alignment so that the initial buffer alignment is
-		 * easy to calculate and verify.
-		 */
-		nbytes += niovecs * sizeof(uint64_t);
-		nbytes = round_up(nbytes, sizeof(uint64_t));
-
-		/* grab the old item if it exists for reservation accounting */
-		old_lv = lip->li_lv;
-
-		/*
-		 * The data buffer needs to start 64-bit aligned, so round up
-		 * that space to ensure we can align it appropriately and not
-		 * overrun the buffer.
-		 */
-		buf_size = nbytes +
-			   round_up((sizeof(struct xfs_log_vec) +
-				     niovecs * sizeof(struct xfs_log_iovec)),
-				    sizeof(uint64_t));
-
 		/* compare to existing item size */
-		if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+		old_lv = lip->li_lv;
+		if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
 			/* same or smaller, optimise common overwrite case */
 			lv = lip->li_lv;
 			lv->lv_next = NULL;
@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
 			 */
 			*diff_iovecs -= lv->lv_niovecs;
 			*diff_len -= lv->lv_bytes;
+
+			/* Ensure the lv is set up according to ->iop_size */
+			lv->lv_niovecs = shadow->lv_niovecs;
+
+			/* reset the lv buffer information for new formatting */
+			lv->lv_buf_len = 0;
+			lv->lv_bytes = 0;
+			lv->lv_buf = (char *)lv +
+					xlog_cil_iovec_space(lv->lv_niovecs);
 		} else {
-			/* allocate new data chunk */
-			lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+			/* switch to shadow buffer! */
+			lv = shadow;
 			lv->lv_item = lip;
-			lv->lv_size = buf_size;
 			if (ordered) {
 				/* track as an ordered logvec */
 				ASSERT(lip->li_lv == NULL);
-				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
 				goto insert;
 			}
-			lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 		}

-		/* Ensure the lv is set up according to ->iop_size */
-		lv->lv_niovecs = niovecs;
-
-		/* The allocated data region lies beyond the iovec region */
-		lv->lv_buf_len = 0;
-		lv->lv_bytes = 0;
-		lv->lv_buf = (char *)lv + buf_size - nbytes;
 		ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
-
 		lip->li_ops->iop_format(lip, lv);
 insert:
-		ASSERT(lv->lv_buf_len <= nbytes);
 		xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
 	}
 }
@ -783,6 +906,13 @@ xfs_log_commit_cil(
 	struct xlog		*log = mp->m_log;
 	struct xfs_cil		*cil = log->l_cilp;

+	/*
+	 * Do all necessary memory allocation before we lock the CIL.
+	 * This ensures the allocation does not deadlock with a CIL
+	 * push in memory reclaim (e.g. from kswapd).
+	 */
+	xlog_cil_alloc_shadow_bufs(log, tp);
+
 	/* lock out background commit */
 	down_read(&cil->xc_ctx_lock);

--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@ -272,13 +272,15 @@ xfs_readsb(
 	buf_ops = NULL;

 	/*
-	 * Allocate a (locked) buffer to hold the superblock.
-	 * This will be kept around at all times to optimize
-	 * access to the superblock.
+	 * Allocate a (locked) buffer to hold the superblock. This will be kept
+	 * around at all times to optimize access to the superblock. Therefore,
+	 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
+	 * elevated.
 	 */
 reread:
 	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-				   BTOBB(sector_size), 0, &bp, buf_ops);
+				      BTOBB(sector_size), XBF_NO_IOACCT, &bp,
+				      buf_ops);
 	if (error) {
 		if (loud)
 			xfs_warn(mp, "SB validate failed with error %d.", error);
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@ -22,6 +22,11 @@
 	BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
 		#structname ") is wrong, expected " #size)

+#define XFS_CHECK_OFFSET(structname, member, off) \
+	BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+		"XFS: offsetof(" #structname ", " #member ") is wrong, " \
+		"expected " #off)
+
 static inline void __init
 xfs_check_ondisk_structs(void)
 {
@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key,		8);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec,		16);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block,		4);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr,	48);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr,	64);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block,		72);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_dinode,		176);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot,		104);
@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t,	12);
 	 */

+	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen,	0);
+	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen,	2);
+	XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval,	3);
+	XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk,	0);
+	XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen,	4);
+	XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen,	8);
+	XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name,	9);
 	XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t,		40);
-	XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t,		8);
+	XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize,	0);
+	XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count,	2);
+	XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen,	4);
+	XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
+	XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags,	6);
+	XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval,	7);
 	XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t,			12);
 	XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t,			16);
 	XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t,		8);
 	XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t,		16);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t,		4);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t,		16);
-	XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t,		6);
+	XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag,	0);
+	XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length,	2);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t,		16);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t,			16);
-	XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t,			4);
-	XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t,			8);
-	XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t,			8);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t,		8);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t,		16);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t,			16);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t,		4);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t,		3);
+	XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen,		0);
+	XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset,		1);
+	XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name,		3);
 	XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t,		10);
-	XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t,		2);

 	/* log structures */
 	XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat,		24);
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@ -1,6 +1,7 @@
 /*
 * Copyright (c) 2014 Christoph Hellwig.
 */
+#include <linux/iomap.h>
 #include "xfs.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
@ -79,32 +80,6 @@ xfs_fs_get_uuid(
 	return 0;
 }

-static void
-xfs_bmbt_to_iomap(
-	struct xfs_inode	*ip,
-	struct iomap		*iomap,
-	struct xfs_bmbt_irec	*imap)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-
-	if (imap->br_startblock == HOLESTARTBLOCK) {
-		iomap->blkno = IOMAP_NULL_BLOCK;
-		iomap->type = IOMAP_HOLE;
-	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
-		iomap->blkno = IOMAP_NULL_BLOCK;
-		iomap->type = IOMAP_DELALLOC;
-	} else {
-		iomap->blkno =
-			XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
-		if (imap->br_state == XFS_EXT_UNWRITTEN)
-			iomap->type = IOMAP_UNWRITTEN;
-		else
-			iomap->type = IOMAP_MAPPED;
-	}
-	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
-	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-}
-
 /*
 * Get a layout for the pNFS client.
 */
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@ -98,8 +98,6 @@ xfs_growfs_rt(
 /*
 * From xfs_rtbitmap.c
 */
-int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
-		  xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
 int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
 		      xfs_rtblock_t start, xfs_extlen_t len, int val,
 		      xfs_rtblock_t *new, int *stat);
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@ -546,7 +546,7 @@ xfs_showargs(

 	return 0;
 }
-__uint64_t
+static __uint64_t
 xfs_max_file_offset(
 	unsigned int		blockshift)
 {
@ -1294,6 +1294,7 @@ xfs_fs_remount(
 		 */
 		xfs_restore_resvblks(mp);
 		xfs_log_work_queue(mp);
+		xfs_queue_eofblocks(mp);
 	}

 	/* rw -> ro */
@ -1306,6 +1307,13 @@ xfs_fs_remount(
 		 * return it to the same size.
 		 */
 		xfs_save_resvblks(mp);
+
+		/*
+		 * Cancel background eofb scanning so it cannot race with the
+		 * final log force+buftarg wait and deadlock the remount.
+		 */
+		cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
 	}
@ -1565,10 +1573,6 @@ xfs_fs_fill_super(
 		}
 	}

-	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
-		xfs_alert(mp,
-	"EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
-
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;
@ -1692,8 +1696,9 @@ xfs_init_zones(void)
 	if (!xfs_log_ticket_zone)
 		goto out_free_ioend_bioset;

-	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
-						"xfs_bmap_free_item");
+	xfs_bmap_free_item_zone = kmem_zone_init(
+			sizeof(struct xfs_bmap_free_item),
+			"xfs_bmap_free_item");
 	if (!xfs_bmap_free_item_zone)
 		goto out_destroy_log_ticket_zone;

--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@ -61,8 +61,6 @@ struct xfs_mount;
 struct xfs_buftarg;
 struct block_device;

-extern __uint64_t xfs_max_file_offset(unsigned int);
-
 extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@ -634,6 +634,9 @@ xfs_error_get_cfg(
 {
 	struct xfs_error_cfg	*cfg;

+	if (error < 0)
+		error = -error;
+
 	switch (error) {
 	case EIO:
 		cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait);
 DEFINE_BUF_EVENT(xfs_buf_bawrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
 DEFINE_BUF_EVENT(xfs_buf_trylock);
 DEFINE_BUF_EVENT(xfs_buf_unlock);
 DEFINE_BUF_EVENT(xfs_buf_iowait);
@ -1134,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
 )

 DECLARE_EVENT_CLASS(xfs_file_class,
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
-	TP_ARGS(ip, count, offset, flags),
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
+	TP_ARGS(ip, count, offset),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, ino)
 		__field(xfs_fsize_t, size)
 		__field(loff_t, offset)
 		__field(size_t, count)
-		__field(int, flags)
 	),
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
@ -1150,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class,
 		__entry->size = ip->i_d.di_size;
 		__entry->offset = offset;
 		__entry->count = count;
-		__entry->flags = flags;
 	),
-	TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
-		  "offset 0x%llx count 0x%zx ioflags %s",
+	TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->size,
 		  __entry->offset,
-		  __entry->count,
-		  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+		  __entry->count)
 )

 #define DEFINE_RW_EVENT(name)		\
 DEFINE_EVENT(xfs_file_class, name,	\
-	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),	\
-	TP_ARGS(ip, count, offset, flags))
-DEFINE_RW_EVENT(xfs_file_read);
+	TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),	\
+	TP_ARGS(ip, count, offset))
+DEFINE_RW_EVENT(xfs_file_buffered_read);
+DEFINE_RW_EVENT(xfs_file_direct_read);
+DEFINE_RW_EVENT(xfs_file_dax_read);
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_dax_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);

 DECLARE_EVENT_CLASS(xfs_page_class,
@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
+DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IOMAP_EVENT(xfs_iomap_found);
+DEFINE_IOMAP_EVENT(xfs_iomap_not_found);

 DECLARE_EVENT_CLASS(xfs_simple_io_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@ -52,6 +52,7 @@ typedef struct xfs_log_item {
 	/* delayed logging */
 	struct list_head		li_cil;		/* CIL pointers */
 	struct xfs_log_vec		*li_lv;		/* active log vector */
+	struct xfs_log_vec		*li_lv_shadow;	/* standby vector */
 	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;

--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@ -6,6 +6,7 @@
 struct dentry;
 struct iattr;
 struct inode;
+struct iomap;
 struct super_block;
 struct vfsmount;

@ -187,21 +188,6 @@ struct fid {
 *    get_name is not (which is possibly inconsistent)
 */

-/* types of block ranges for multipage write mappings. */
-#define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
-#define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
-#define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
-#define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
-
-#define IOMAP_NULL_BLOCK -1LL	/* blkno is not valid */
-
-struct iomap {
-	sector_t	blkno;	/* first sector of mapping */
-	loff_t		offset;	/* file offset of mapping, bytes */
-	u64		length;	/* length of mapping, bytes */
-	int		type;	/* type of mapping */
-};
-
 struct export_operations {
 	int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
 			struct inode *parent);
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@ -0,0 +1,70 @@
+#ifndef LINUX_IOMAP_H
+#define LINUX_IOMAP_H 1
+
+#include <linux/types.h>
+
+struct fiemap_extent_info;
+struct inode;
+struct iov_iter;
+struct kiocb;
+struct vm_area_struct;
+struct vm_fault;
+
+/*
+ * Types of block ranges for iomap mappings:
+ */
+#define IOMAP_HOLE	0x01	/* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC	0x02	/* delayed allocation blocks */
+#define IOMAP_MAPPED	0x03	/* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
+
+/*
+ * Magic value for blkno:
+ */
+#define IOMAP_NULL_BLOCK -1LL	/* blkno is not valid */
+
+struct iomap {
+	sector_t		blkno;	/* 1st sector of mapping, 512b units */
+	loff_t			offset;	/* file offset of mapping, bytes */
+	u64			length;	/* length of mapping, bytes */
+	int			type;	/* type of mapping */
+	struct block_device	*bdev;	/* block device for I/O */
+};
+
+/*
+ * Flags for iomap_begin / iomap_end.  No flag implies a read.
+ */
+#define IOMAP_WRITE		(1 << 0)
+#define IOMAP_ZERO		(1 << 1)
+
+struct iomap_ops {
+	/*
+	 * Return the existing mapping at pos, or reserve space starting at
+	 * pos for up to length, as long as we can do it as a single mapping.
+	 * The actual length is returned in iomap->length.
+	 */
+	int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
+			unsigned flags, struct iomap *iomap);
+
+	/*
+	 * Commit and/or unreserve space previously allocated using iomap_begin.
+	 * Written indicates the length of the successful write operation which
+	 * needs to be committed, while the rest needs to be unreserved.
+	 * Written might be zero if no data was written.
+	 */
+	int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
+			ssize_t written, unsigned flags, struct iomap *iomap);
+};
+
+ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+		struct iomap_ops *ops);
+int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
+		bool *did_zero, struct iomap_ops *ops);
+int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
+		struct iomap_ops *ops);
+int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		struct iomap_ops *ops);
+int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		loff_t start, loff_t len, struct iomap_ops *ops);
+
+#endif /* LINUX_IOMAP_H */