From 85a7b33b9653ade7b26b9f29765cb1f0719c1e84 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Thu, 26 Jul 2012 23:39:10 +0200
Subject: [PATCH 001/121] Btrfs: add rdev to get_inode_info in send/receive

We need rdev in the next commit.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fb5ffe95f869..5721401548e8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -687,7 +687,8 @@ out:
  */
 static int get_inode_info(struct btrfs_root *root,
 			  u64 ino, u64 *size, u64 *gen,
-			  u64 *mode, u64 *uid, u64 *gid)
+			  u64 *mode, u64 *uid, u64 *gid,
+			  u64 *rdev)
 {
 	int ret;
 	struct btrfs_inode_item *ii;
@@ -721,6 +722,8 @@ static int get_inode_info(struct btrfs_root *root,
 		*uid = btrfs_inode_uid(path->nodes[0], ii);
 	if (gid)
 		*gid = btrfs_inode_gid(path->nodes[0], ii);
+	if (rdev)
+		*rdev = btrfs_inode_rdev(path->nodes[0], ii);
 
 out:
 	btrfs_free_path(path);
@@ -1081,7 +1084,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	 * There are inodes that have extents that lie behind it's i_size. Don't
 	 * accept clones from these extents.
 	 */
-	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL);
+	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
+			NULL);
 	if (ret < 0)
 		return ret;
 
@@ -1404,7 +1408,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 	u64 right_gen;
 
 	ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0 && ret != -ENOENT)
 		goto out;
 	left_ret = ret;
@@ -1413,7 +1417,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 		right_ret = -ENOENT;
 	} else {
 		ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
-				NULL, NULL, NULL);
+				NULL, NULL, NULL, NULL);
 		if (ret < 0 && ret != -ENOENT)
 			goto out;
 		right_ret = ret;
@@ -1557,7 +1561,7 @@ static int get_first_ref(struct send_ctx *sctx,
 	btrfs_release_path(path);
 
 	ret = get_inode_info(root, found_key.offset, NULL, dir_gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -1628,7 +1632,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 
 	if (other_inode > sctx->send_progress) {
 		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
-				who_gen, NULL, NULL, NULL);
+				who_gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 
@@ -1671,7 +1675,7 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 	}
 
 	ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -2540,7 +2544,7 @@ int __finish_unordered_dir(int num, struct btrfs_key *di_key,
 	fs_path_reset(fctx->cur_path);
 
 	ret = get_inode_info(sctx->send_root, di_key->objectid,
-			NULL, &di_gen, &di_mode, NULL, NULL);
+			NULL, &di_gen, &di_mode, NULL, NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -2971,7 +2975,7 @@ static int __record_new_ref(int num, u64 dir, int index,
 		return -ENOMEM;
 
 	ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -3029,7 +3033,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
 		return -ENOMEM;
 
 	ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
-			NULL);
+			NULL, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -3642,7 +3646,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 
 	if (clone_root2 == sctx->send_root) {
 		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
-				&gen, NULL, NULL, NULL);
+				&gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 		ret = get_cur_path(sctx, clone_root->ino, gen, p);
@@ -4004,7 +4008,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 		goto out;
 
 	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
-			&left_mode, &left_uid, &left_gid);
+			&left_mode, &left_uid, &left_gid, NULL);
 	if (ret < 0)
 		goto out;
 
@@ -4015,7 +4019,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
 		} else {
 			ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
 					NULL, NULL, &right_mode, &right_uid,
-					&right_gid);
+					&right_gid, NULL);
 			if (ret < 0)
 				goto out;
 

From 1f4692da951af4179a6522c6b48a09a43d37e614 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 10:42:24 +0200
Subject: [PATCH 002/121] Btrfs: fix cur_ino < parent_ino case for send/receive

When the current inodes inum is smaller then the inum of the
parent directory strange things were happending due to wrong
path resolution and other bugs. Fix this with a new approach
for the problem.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 390 ++++++++++++++++++------------------------------
 1 file changed, 146 insertions(+), 244 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 5721401548e8..a5fae484d4e1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -107,7 +107,6 @@ struct send_ctx {
 	int cur_inode_new;
 	int cur_inode_new_gen;
 	int cur_inode_deleted;
-	int cur_inode_first_ref_orphan;
 	u64 cur_inode_size;
 	u64 cur_inode_mode;
 
@@ -2296,25 +2295,25 @@ out:
  * a valid path yet because we did not process the refs yet. So, the inode
  * is created as orphan.
  */
-static int send_create_inode(struct send_ctx *sctx, struct btrfs_path *path,
-			     struct btrfs_key *key)
+static int send_create_inode(struct send_ctx *sctx, u64 ino)
 {
 	int ret = 0;
-	struct extent_buffer *eb = path->nodes[0];
-	struct btrfs_inode_item *ii;
 	struct fs_path *p;
-	int slot = path->slots[0];
 	int cmd;
+	u64 gen;
 	u64 mode;
+	u64 rdev;
 
-verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
+verbose_printk("btrfs: send_create_inode %llu\n", ino);
 
 	p = fs_path_alloc(sctx);
 	if (!p)
 		return -ENOMEM;
 
-	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
-	mode = btrfs_inode_mode(eb, ii);
+	ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
+			NULL, &rdev);
+	if (ret < 0)
+		goto out;
 
 	if (S_ISREG(mode))
 		cmd = BTRFS_SEND_C_MKFILE;
@@ -2339,22 +2338,22 @@ verbose_printk("btrfs: send_create_inode %llu\n", sctx->cur_ino);
 	if (ret < 0)
 		goto out;
 
-	ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+	ret = gen_unique_name(sctx, ino, gen, p);
 	if (ret < 0)
 		goto out;
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
-	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, sctx->cur_ino);
+	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
 
 	if (S_ISLNK(mode)) {
 		fs_path_reset(p);
-		ret = read_symlink(sctx, sctx->send_root, sctx->cur_ino, p);
+		ret = read_symlink(sctx, sctx->send_root, ino, p);
 		if (ret < 0)
 			goto out;
 		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
 	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
 		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
-		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, btrfs_inode_rdev(eb, ii));
+		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, rdev);
 	}
 
 	ret = send_cmd(sctx);
@@ -2368,6 +2367,92 @@ out:
 	return ret;
 }
 
+/*
+ * We need some special handling for inodes that get processed before the parent
+ * directory got created. See process_recorded_refs for details.
+ * This function does the check if we already created the dir out of order.
+ */
+static int did_create_dir(struct send_ctx *sctx, u64 dir)
+{
+	int ret = 0;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key di_key;
+	struct extent_buffer *eb;
+	struct btrfs_dir_item *di;
+	int slot;
+
+	path = alloc_path_for_send();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = dir;
+	key.type = BTRFS_DIR_INDEX_KEY;
+	key.offset = 0;
+	while (1) {
+		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
+				1, 0);
+		if (ret < 0)
+			goto out;
+		if (!ret) {
+			eb = path->nodes[0];
+			slot = path->slots[0];
+			btrfs_item_key_to_cpu(eb, &found_key, slot);
+		}
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+
+		if (di_key.objectid < sctx->send_progress) {
+			ret = 1;
+			goto out;
+		}
+
+		key.offset = found_key.offset + 1;
+		btrfs_release_path(path);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Only creates the inode if it is:
+ * 1. Not a directory
+ * 2. Or a directory which was not created already due to out of order
+ *    directories. See did_create_dir and process_recorded_refs for details.
+ */
+static int send_create_inode_if_needed(struct send_ctx *sctx)
+{
+	int ret;
+
+	if (S_ISDIR(sctx->cur_inode_mode)) {
+		ret = did_create_dir(sctx, sctx->cur_ino);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	ret = send_create_inode(sctx, sctx->cur_ino);
+	if (ret < 0)
+		goto out;
+
+out:
+	return ret;
+}
+
 struct recorded_ref {
 	struct list_head list;
 	char *dir_path;
@@ -2517,160 +2602,6 @@ out:
 	return ret;
 }
 
-struct finish_unordered_dir_ctx {
-	struct send_ctx *sctx;
-	struct fs_path *cur_path;
-	struct fs_path *dir_path;
-	u64 dir_ino;
-	int need_delete;
-	int delete_pass;
-};
-
-int __finish_unordered_dir(int num, struct btrfs_key *di_key,
-			   const char *name, int name_len,
-			   const char *data, int data_len,
-			   u8 type, void *ctx)
-{
-	int ret = 0;
-	struct finish_unordered_dir_ctx *fctx = ctx;
-	struct send_ctx *sctx = fctx->sctx;
-	u64 di_gen;
-	u64 di_mode;
-	int is_orphan = 0;
-
-	if (di_key->objectid >= fctx->dir_ino)
-		goto out;
-
-	fs_path_reset(fctx->cur_path);
-
-	ret = get_inode_info(sctx->send_root, di_key->objectid,
-			NULL, &di_gen, &di_mode, NULL, NULL, NULL);
-	if (ret < 0)
-		goto out;
-
-	ret = is_first_ref(sctx, sctx->send_root, di_key->objectid,
-			fctx->dir_ino, name, name_len);
-	if (ret < 0)
-		goto out;
-	if (ret) {
-		is_orphan = 1;
-		ret = gen_unique_name(sctx, di_key->objectid, di_gen,
-				fctx->cur_path);
-	} else {
-		ret = get_cur_path(sctx, di_key->objectid, di_gen,
-				fctx->cur_path);
-	}
-	if (ret < 0)
-		goto out;
-
-	ret = fs_path_add(fctx->dir_path, name, name_len);
-	if (ret < 0)
-		goto out;
-
-	if (!fctx->delete_pass) {
-		if (S_ISDIR(di_mode)) {
-			ret = send_rename(sctx, fctx->cur_path,
-					fctx->dir_path);
-		} else {
-			ret = send_link(sctx, fctx->dir_path,
-					fctx->cur_path);
-			if (is_orphan)
-				fctx->need_delete = 1;
-		}
-	} else if (!S_ISDIR(di_mode)) {
-		ret = send_unlink(sctx, fctx->cur_path);
-	} else {
-		ret = 0;
-	}
-
-	fs_path_remove(fctx->dir_path);
-
-out:
-	return ret;
-}
-
-/*
- * Go through all dir items and see if we find refs which could not be created
- * in the past because the dir did not exist at that time.
- */
-static int finish_outoforder_dir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
-{
-	int ret = 0;
-	struct btrfs_path *path = NULL;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct extent_buffer *eb;
-	struct finish_unordered_dir_ctx fctx;
-	int slot;
-
-	path = alloc_path_for_send();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	memset(&fctx, 0, sizeof(fctx));
-	fctx.sctx = sctx;
-	fctx.cur_path = fs_path_alloc(sctx);
-	fctx.dir_path = fs_path_alloc(sctx);
-	if (!fctx.cur_path || !fctx.dir_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	fctx.dir_ino = dir;
-
-	ret = get_cur_path(sctx, dir, dir_gen, fctx.dir_path);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * We do two passes. The first links in the new refs and the second
-	 * deletes orphans if required. Deletion of orphans is not required for
-	 * directory inodes, as we always have only one ref and use rename
-	 * instead of link for those.
-	 */
-
-again:
-	key.objectid = dir;
-	key.type = BTRFS_DIR_ITEM_KEY;
-	key.offset = 0;
-	while (1) {
-		ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
-				1, 0);
-		if (ret < 0)
-			goto out;
-		eb = path->nodes[0];
-		slot = path->slots[0];
-		btrfs_item_key_to_cpu(eb, &found_key, slot);
-
-		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type) {
-			btrfs_release_path(path);
-			break;
-		}
-
-		ret = iterate_dir_item(sctx, sctx->send_root, path,
-				&found_key, __finish_unordered_dir,
-				&fctx);
-		if (ret < 0)
-			goto out;
-
-		key.offset = found_key.offset + 1;
-		btrfs_release_path(path);
-	}
-
-	if (!fctx.delete_pass && fctx.need_delete) {
-		fctx.delete_pass = 1;
-		goto again;
-	}
-
-out:
-	btrfs_free_path(path);
-	fs_path_free(sctx, fctx.cur_path);
-	fs_path_free(sctx, fctx.dir_path);
-	return ret;
-}
-
 /*
  * This does all the move/link/unlink/rmdir magic.
  */
@@ -2678,6 +2609,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
 {
 	int ret = 0;
 	struct recorded_ref *cur;
+	struct recorded_ref *cur2;
 	struct ulist *check_dirs = NULL;
 	struct ulist_iterator uit;
 	struct ulist_node *un;
@@ -2734,6 +2666,46 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	}
 
 	list_for_each_entry(cur, &sctx->new_refs, list) {
+		/*
+		 * We may have refs where the parent directory does not exist
+		 * yet. This happens if the parent directories inum is higher
+		 * the the current inum. To handle this case, we create the
+		 * parent directory out of order. But we need to check if this
+		 * did already happen before due to other refs in the same dir.
+		 */
+		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+		if (ret < 0)
+			goto out;
+		if (ret == inode_state_will_create) {
+			ret = 0;
+			/*
+			 * First check if any of the current inodes refs did
+			 * already create the dir.
+			 */
+			list_for_each_entry(cur2, &sctx->new_refs, list) {
+				if (cur == cur2)
+					break;
+				if (cur2->dir == cur->dir) {
+					ret = 1;
+					break;
+				}
+			}
+
+			/*
+			 * If that did not happen, check if a previous inode
+			 * did already create the dir.
+			 */
+			if (!ret)
+				ret = did_create_dir(sctx, cur->dir);
+			if (ret < 0)
+				goto out;
+			if (!ret) {
+				ret = send_create_inode(sctx, cur->dir);
+				if (ret < 0)
+					goto out;
+			}
+		}
+
 		/*
 		 * Check if this new ref would overwrite the first ref of
 		 * another unprocessed inode. If yes, orphanize the
@@ -2768,7 +2740,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 * inode, move it and update valid_path. If not, link or move
 		 * it depending on the inode mode.
 		 */
-		if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+		if (is_orphan) {
 			ret = send_rename(sctx, valid_path, cur->full_path);
 			if (ret < 0)
 				goto out;
@@ -2844,35 +2816,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 			if (!ret) {
-				/*
-				 * In case the inode was moved to a directory
-				 * that was not created yet (see
-				 * __record_new_ref), we can not unlink the ref
-				 * as it will be needed later when the parent
-				 * directory is created, so that we can move in
-				 * the inode to the new dir.
-				 */
-				if (!is_orphan &&
-				    sctx->cur_inode_first_ref_orphan) {
-					ret = orphanize_inode(sctx,
-							sctx->cur_ino,
-							sctx->cur_inode_gen,
-							cur->full_path);
-					if (ret < 0)
-						goto out;
-					ret = gen_unique_name(sctx,
-							sctx->cur_ino,
-							sctx->cur_inode_gen,
-							valid_path);
-					if (ret < 0)
-						goto out;
-					is_orphan = 1;
-
-				} else {
-					ret = send_unlink(sctx, cur->full_path);
-					if (ret < 0)
-						goto out;
-				}
+				ret = send_unlink(sctx, cur->full_path);
+				if (ret < 0)
+					goto out;
 			}
 			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
 					GFP_NOFS);
@@ -2885,11 +2831,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 * happen when a previous inode did overwrite the first ref
 		 * of this inode and no new refs were added for the current
 		 * inode.
-		 * We can however not delete the orphan in case the inode relies
-		 * in a directory that was not created yet (see
-		 * __record_new_ref)
 		 */
-		if (is_orphan && !sctx->cur_inode_first_ref_orphan) {
+		if (is_orphan) {
 			ret = send_unlink(sctx, valid_path);
 			if (ret < 0)
 				goto out;
@@ -2939,19 +2882,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 */
 	sctx->send_progress = sctx->cur_ino + 1;
 
-	/*
-	 * We may have a directory here that has pending refs which could not
-	 * be created before (because the dir did not exist before, see
-	 * __record_new_ref). finish_outoforder_dir will link/move the pending
-	 * refs.
-	 */
-	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_new) {
-		ret = finish_outoforder_dir(sctx, sctx->cur_ino,
-				sctx->cur_inode_gen);
-		if (ret < 0)
-			goto out;
-	}
-
 	ret = 0;
 
 out:
@@ -2979,31 +2909,6 @@ static int __record_new_ref(int num, u64 dir, int index,
 	if (ret < 0)
 		goto out;
 
-	/*
-	 * The parent may be non-existent at this point in time. This happens
-	 * if the ino of the parent dir is higher then the current ino. In this
-	 * case, we can not process this ref until the parent dir is finally
-	 * created. If we reach the parent dir later, process_recorded_refs
-	 * will go through all dir items and process the refs that could not be
-	 * processed before. In case this is the first ref, we set
-	 * cur_inode_first_ref_orphan to 1 to inform process_recorded_refs to
-	 * keep an orphan of the inode so that it later can be used for
-	 * link/move
-	 */
-	ret = is_inode_existent(sctx, dir, gen);
-	if (ret < 0)
-		goto out;
-	if (!ret) {
-		ret = is_first_ref(sctx, sctx->send_root, sctx->cur_ino, dir,
-				name->start, fs_path_len(name));
-		if (ret < 0)
-			goto out;
-		if (ret)
-			sctx->cur_inode_first_ref_orphan = 1;
-		ret = 0;
-		goto out;
-	}
-
 	ret = get_cur_path(sctx, dir, gen, p);
 	if (ret < 0)
 		goto out;
@@ -4078,7 +3983,6 @@ static int changed_inode(struct send_ctx *sctx,
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
-	sctx->cur_inode_first_ref_orphan = 0;
 	sctx->send_progress = sctx->cur_ino;
 
 	if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4115,8 +4019,7 @@ static int changed_inode(struct send_ctx *sctx,
 		sctx->cur_inode_mode = btrfs_inode_mode(
 				sctx->left_path->nodes[0], left_ii);
 		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
-			ret = send_create_inode(sctx, sctx->left_path,
-					sctx->cmp_key);
+			ret = send_create_inode_if_needed(sctx);
 	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
 		sctx->cur_inode_gen = right_gen;
 		sctx->cur_inode_new = 0;
@@ -4146,8 +4049,7 @@ static int changed_inode(struct send_ctx *sctx,
 					sctx->left_path->nodes[0], left_ii);
 			sctx->cur_inode_mode = btrfs_inode_mode(
 					sctx->left_path->nodes[0], left_ii);
-			ret = send_create_inode(sctx, sctx->left_path,
-					sctx->cmp_key);
+			ret = send_create_inode_if_needed(sctx);
 			if (ret < 0)
 				goto out;
 

From b9291affaa4576762c30fb31083f33f5d745dea1 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 11:07:18 +0200
Subject: [PATCH 003/121] Btrfs: add missing check for dir != tmp_dir to
 is_first_ref

We missed that check which resultet in all refs with the same name
being reported as first_ref.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a5fae484d4e1..bea5b4378cc5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1589,7 +1589,7 @@ static int is_first_ref(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
-	if (name_len != fs_path_len(tmp_name)) {
+	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
 		ret = 0;
 		goto out;
 	}

From 9ea3ef516d828e435d283b7d5f0de05fd72a23ac Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 11:08:09 +0200
Subject: [PATCH 004/121] Btrfs: remove unused code with #if 0

fs_path_remove is not used at the moment due to a previous patch.
Remove it for now (with #if 0) to avoid compile warnings.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index bea5b4378cc5..9e3f6d127d82 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -327,6 +327,7 @@ out:
 	return ret;
 }
 
+#if 0
 static void fs_path_remove(struct fs_path *p)
 {
 	BUG_ON(p->reversed);
@@ -334,6 +335,7 @@ static void fs_path_remove(struct fs_path *p)
 		p->end--;
 	*p->end = 0;
 }
+#endif
 
 static int fs_path_copy(struct fs_path *p, struct fs_path *from)
 {

From ccf1626b49a94d66f3bb58d634888049ac695b46 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 11:46:29 +0200
Subject: [PATCH 005/121] Btrfs: add correct parent to check_dirs when dir got
 moved

We only added the parent for the new position of a moved dir.
We also need to add the old parent of the moved dir.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9e3f6d127d82..328654ea4400 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2805,6 +2805,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 		}
+	} else if (S_ISDIR(sctx->cur_inode_mode) &&
+		   !list_empty(&sctx->deleted_refs)) {
+		/*
+		 * We have a moved dir. Add the old parent to check_dirs
+		 */
+		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
+				list);
+		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
+				GFP_NOFS);
+		if (ret < 0)
+			goto out;
 	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
 		/*
 		 * We have a non dir inode. Go through all deleted refs and

From d27aed5e24f8e7bb2b0d11e7a579f2bbdebafc2f Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 12:04:16 +0200
Subject: [PATCH 006/121] Btrfs: remove unused use_list from send/receive code

use_list is a leftover and unused.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 328654ea4400..f1d44b125beb 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -125,7 +125,6 @@ struct send_ctx {
 
 struct name_cache_entry {
 	struct list_head list;
-	struct list_head use_list;
 	u64 ino;
 	u64 gen;
 	u64 parent_ino;
@@ -1906,7 +1905,6 @@ out_cache:
 	nce->name_len = fs_path_len(dest);
 	nce->ret = ret;
 	strcpy(nce->name, dest->start);
-	memset(&nce->use_list, 0, sizeof(nce->use_list));
 
 	if (ino < sctx->send_progress)
 		nce->need_later_update = 0;

From ee849c0472a9fe1dc09fe8390965d993b9c4e979 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 12:42:05 +0200
Subject: [PATCH 007/121] Btrfs: rename backref_ctx::found_in_send_root to
 found_itself

The new name should be easier to understand/read.

Commit is a result of Arne's review.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f1d44b125beb..a25be0c1a6b9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1029,7 +1029,7 @@ struct backref_ctx {
 	u64 extent_len;
 
 	/* Just to check for bugs in backref resolving */
-	int found_in_send_root;
+	int found_itself;
 };
 
 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
@@ -1077,7 +1077,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	if (found->root == bctx->sctx->send_root &&
 	    ino == bctx->cur_objectid &&
 	    offset == bctx->cur_offset) {
-		bctx->found_in_send_root = 1;
+		bctx->found_itself = 1;
 	}
 
 	/*
@@ -1210,7 +1210,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	backref_ctx.found = 0;
 	backref_ctx.cur_objectid = ino;
 	backref_ctx.cur_offset = data_offset;
-	backref_ctx.found_in_send_root = 0;
+	backref_ctx.found_itself = 0;
 	backref_ctx.extent_len = num_bytes;
 
 	/*
@@ -1231,7 +1231,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
-	if (!backref_ctx.found_in_send_root) {
+	if (!backref_ctx.found_itself) {
 		/* found a bug in backref code? */
 		ret = -EIO;
 		printk(KERN_ERR "btrfs: ERROR did not find backref in "

From 35075bb046cc91f42a0e5336bdc07f3279061add Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 12:44:34 +0200
Subject: [PATCH 008/121] Btrfs: use kmalloc instead of stack for backref_ctx

Make sure to never get in trouble due to the backref_ctx
which was on the stack before.

Commit is a result of Arne's review.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a25be0c1a6b9..36711123bdcf 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1149,7 +1149,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	u64 extent_item_pos;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *eb = path->nodes[0];
-	struct backref_ctx backref_ctx;
+	struct backref_ctx *backref_ctx = NULL;
 	struct clone_root *cur_clone_root;
 	struct btrfs_key found_key;
 	struct btrfs_path *tmp_path;
@@ -1159,6 +1159,12 @@ static int find_extent_clone(struct send_ctx *sctx,
 	if (!tmp_path)
 		return -ENOMEM;
 
+	backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+	if (!backref_ctx) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	if (data_offset >= ino_size) {
 		/*
 		 * There may be extents that lie behind the file's size.
@@ -1206,12 +1212,12 @@ static int find_extent_clone(struct send_ctx *sctx,
 		cur_clone_root->found_refs = 0;
 	}
 
-	backref_ctx.sctx = sctx;
-	backref_ctx.found = 0;
-	backref_ctx.cur_objectid = ino;
-	backref_ctx.cur_offset = data_offset;
-	backref_ctx.found_itself = 0;
-	backref_ctx.extent_len = num_bytes;
+	backref_ctx->sctx = sctx;
+	backref_ctx->found = 0;
+	backref_ctx->cur_objectid = ino;
+	backref_ctx->cur_offset = data_offset;
+	backref_ctx->found_itself = 0;
+	backref_ctx->extent_len = num_bytes;
 
 	/*
 	 * The last extent of a file may be too large due to page alignment.
@@ -1219,7 +1225,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	 * __iterate_backrefs work.
 	 */
 	if (data_offset + num_bytes >= ino_size)
-		backref_ctx.extent_len = ino_size - data_offset;
+		backref_ctx->extent_len = ino_size - data_offset;
 
 	/*
 	 * Now collect all backrefs.
@@ -1227,11 +1233,11 @@ static int find_extent_clone(struct send_ctx *sctx,
 	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(sctx->send_root->fs_info,
 					found_key.objectid, extent_item_pos, 1,
-					__iterate_backrefs, &backref_ctx);
+					__iterate_backrefs, backref_ctx);
 	if (ret < 0)
 		goto out;
 
-	if (!backref_ctx.found_itself) {
+	if (!backref_ctx->found_itself) {
 		/* found a bug in backref code? */
 		ret = -EIO;
 		printk(KERN_ERR "btrfs: ERROR did not find backref in "
@@ -1246,7 +1252,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 		"num_bytes=%llu, logical=%llu\n",
 		data_offset, ino, num_bytes, logical);
 
-	if (!backref_ctx.found)
+	if (!backref_ctx->found)
 		verbose_printk("btrfs:    no clones found\n");
 
 	cur_clone_root = NULL;
@@ -1271,6 +1277,7 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 
 out:
 	btrfs_free_path(tmp_path);
+	kfree(backref_ctx);
 	return ret;
 }
 

From 52f9e53ede8e1b261e68216c6c2f32bb3f26c795 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 16:32:45 +0200
Subject: [PATCH 009/121] Btrfs: use normal return path for root == send_root
 case

Don't have a seperate return path for the mentioned case. Now
we do the same "take lowest inode/offset" logic for all found clones.

Commit is a result of Arne's review.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 36711123bdcf..d2a4ee9125df 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1109,12 +1109,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 			return 0;
 		if (offset + ctx->extent_len > ctx->cur_offset)
 			return 0;*/
-
-		bctx->found++;
-		found->found_refs++;
-		found->ino = ino;
-		found->offset = offset;
-		return 0;
 	}
 
 	bctx->found++;

From adbe7fb6c4750621a56867d9bb1980da3a4b8f33 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 12:51:32 +0200
Subject: [PATCH 010/121] Btrfs: don't break in the final loop of
 find_extent_clone

If we break, we may miss the clone from send_root which we prefer
over all other clones.

Commit is a result of Arne's review.

Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d2a4ee9125df..68b2543e5d6c 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1257,7 +1257,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
 			else if (sctx->clone_roots[i].root == sctx->send_root)
 				/* prefer clones from send_root over others */
 				cur_clone_root = sctx->clone_roots + i;
-			break;
 		}
 
 	}

From 17589bd96eeb7c2ef2d3baeb05005a24932cd1e9 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 14:13:35 +0200
Subject: [PATCH 011/121] Btrfs: fix memory leak for name_cache in send/receive

When everything is done, name_cache_free is called which however
forgot to call kfree on the cache entries.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 68b2543e5d6c..9cee678c0fb6 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1822,6 +1822,7 @@ static void name_cache_free(struct send_ctx *sctx)
 
 	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
 		name_cache_delete(sctx, nce);
+		kfree(nce);
 	}
 }
 

From 7e0926fe5f20b5c88e51cba68512302b10f73d2a Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 14:20:58 +0200
Subject: [PATCH 012/121] Btrfs: fix use of radix_tree for name_cache in
 send/receive

We can't easily use the index of the radix tree for inums as the
radix tree uses 32bit indexes on 32bit kernels. For 32bit kernels,
we now use the lower 32bit of the inum as index and an additional
list to store multiple entries per radix tree entry.

Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 74 ++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 38 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9cee678c0fb6..02e901adc3e8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -125,6 +125,15 @@ struct send_ctx {
 
 struct name_cache_entry {
 	struct list_head list;
+	/*
+	 * radix_tree has only 32bit entries but we need to handle 64bit inums.
+	 * We use the lower 32bit of the 64bit inum to store it in the tree. If
+	 * more then one inum would fall into the same entry, we use radix_list
+	 * to store the additional entries. radix_list is also used to store
+	 * entries where two entries have the same inum but different
+	 * generations.
+	 */
+	struct list_head radix_list;
 	u64 ino;
 	u64 gen;
 	u64 parent_ino;
@@ -1726,27 +1735,21 @@ static int name_cache_insert(struct send_ctx *sctx,
 			     struct name_cache_entry *nce)
 {
 	int ret = 0;
-	struct name_cache_entry **ncea;
+	struct list_head *nce_head;
 
-	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-	if (ncea) {
-		if (!ncea[0])
-			ncea[0] = nce;
-		else if (!ncea[1])
-			ncea[1] = nce;
-		else
-			BUG();
-	} else {
-		ncea = kmalloc(sizeof(void *) * 2, GFP_NOFS);
-		if (!ncea)
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	if (!nce_head) {
+		nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+		if (!nce_head)
 			return -ENOMEM;
+		INIT_LIST_HEAD(nce_head);
 
-		ncea[0] = nce;
-		ncea[1] = NULL;
-		ret = radix_tree_insert(&sctx->name_cache, nce->ino, ncea);
+		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
 		if (ret < 0)
 			return ret;
 	}
+	list_add_tail(&nce->radix_list, nce_head);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
 	sctx->name_cache_size++;
 
@@ -1756,41 +1759,36 @@ static int name_cache_insert(struct send_ctx *sctx,
 static void name_cache_delete(struct send_ctx *sctx,
 			      struct name_cache_entry *nce)
 {
-	struct name_cache_entry **ncea;
+	struct list_head *nce_head;
 
-	ncea = radix_tree_lookup(&sctx->name_cache, nce->ino);
-	BUG_ON(!ncea);
-
-	if (ncea[0] == nce)
-		ncea[0] = NULL;
-	else if (ncea[1] == nce)
-		ncea[1] = NULL;
-	else
-		BUG();
-
-	if (!ncea[0] && !ncea[1]) {
-		radix_tree_delete(&sctx->name_cache, nce->ino);
-		kfree(ncea);
-	}
+	nce_head = radix_tree_lookup(&sctx->name_cache,
+			(unsigned long)nce->ino);
+	BUG_ON(!nce_head);
 
+	list_del(&nce->radix_list);
 	list_del(&nce->list);
-
 	sctx->name_cache_size--;
+
+	if (list_empty(nce_head)) {
+		radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
+		kfree(nce_head);
+	}
 }
 
 static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
 						    u64 ino, u64 gen)
 {
-	struct name_cache_entry **ncea;
+	struct list_head *nce_head;
+	struct name_cache_entry *cur;
 
-	ncea = radix_tree_lookup(&sctx->name_cache, ino);
-	if (!ncea)
+	nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
+	if (!nce_head)
 		return NULL;
 
-	if (ncea[0] && ncea[0]->gen == gen)
-		return ncea[0];
-	else if (ncea[1] && ncea[1]->gen == gen)
-		return ncea[1];
+	list_for_each_entry(cur, nce_head, radix_list) {
+		if (cur->ino == ino && cur->gen == gen)
+			return cur;
+	}
 	return NULL;
 }
 

From 34d73f54e2e2227cece751f168d08d3103092992 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 16:18:58 +0200
Subject: [PATCH 013/121] Btrfs: make aux field of ulist 64 bit

Btrfs send/receive uses the aux field to store inode numbers. On
32 bit machines this may become a problem.

Also fix all users of ulist_add and ulist_add_merged.

Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/backref.c |  8 ++++----
 fs/btrfs/qgroup.c  | 20 ++++++++++----------
 fs/btrfs/ulist.c   |  7 +++----
 fs/btrfs/ulist.h   |  9 ++++-----
 4 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ff6475f409d6..6655ca615364 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -231,7 +231,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			}
 			if (!ret) {
 				ret = ulist_add(parents, eb->start,
-						(unsigned long)eie, GFP_NOFS);
+						(u64)eie, GFP_NOFS);
 				if (ret < 0)
 					break;
 				if (!extent_item_pos) {
@@ -914,8 +914,8 @@ again:
 				free_extent_buffer(eb);
 			}
 			ret = ulist_add_merge(refs, ref->parent,
-					      (unsigned long)ref->inode_list,
-					      (unsigned long *)&eie, GFP_NOFS);
+					      (u64)ref->inode_list,
+					      (u64 *)&eie, GFP_NOFS);
 			if (!ret && extent_item_pos) {
 				/*
 				 * we've recorded that parent, so we must extend
@@ -1404,7 +1404,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&root_uiter);
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
-				 "%#lx\n", root_node->val, ref_node->val,
+				 "%#llx\n", root_node->val, ref_node->val,
 				 ref_node->aux);
 			ret = iterate_leaf_refs(
 				(struct extent_inode_elem *)ref_node->aux,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b65015581744..24f6adcdf339 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,7 +1145,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 		ulist_reinit(tmp);
 						/* XXX id not needed */
-		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+		ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC);
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ulist_add(tmp, glist->group->qgroupid,
-					  (unsigned long)glist->group,
+					  (u64)glist->group,
 					  GFP_ATOMIC);
 			}
 		}
@@ -1168,7 +1168,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 	 * step 2: walk from the new root
 	 */
 	ulist_reinit(tmp);
-	ulist_add(tmp, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ulist_add(tmp, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(tmp, &uiter))) {
 		struct btrfs_qgroup *qg;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(tmp, glist->group->qgroupid,
-				  (unsigned long)glist->group, GFP_ATOMIC);
+				  (u64)glist->group, GFP_ATOMIC);
 		}
 	}
 
@@ -1208,7 +1208,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 			continue;
 
 		ulist_reinit(tmp);
-		ulist_add(tmp, qg->qgroupid, (unsigned long)qg, GFP_ATOMIC);
+		ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC);
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ulist_add(tmp, glist->group->qgroupid,
-					  (unsigned long)glist->group,
+					  (u64)glist->group,
 					  GFP_ATOMIC);
 			}
 		}
@@ -1469,7 +1469,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	 * be exceeded
 	 */
 	ulist = ulist_alloc(GFP_ATOMIC);
-	ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
@@ -1489,7 +1489,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(ulist, glist->group->qgroupid,
-				  (unsigned long)glist->group, GFP_ATOMIC);
+				  (u64)glist->group, GFP_ATOMIC);
 		}
 	}
 	if (ret)
@@ -1541,7 +1541,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 		goto out;
 
 	ulist = ulist_alloc(GFP_ATOMIC);
-	ulist_add(ulist, qgroup->qgroupid, (unsigned long)qgroup, GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
@@ -1553,7 +1553,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(ulist, glist->group->qgroupid,
-				  (unsigned long)glist->group, GFP_ATOMIC);
+				  (u64)glist->group, GFP_ATOMIC);
 		}
 	}
 
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index ab942f46b3dd..99be4c138db6 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -143,14 +143,13 @@ EXPORT_SYMBOL(ulist_free);
  * In case of allocation failure -ENOMEM is returned and the ulist stays
  * unaltered.
  */
-int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-	      gfp_t gfp_mask)
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
 {
 	return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
 }
 
-int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
-		    unsigned long *old_aux, gfp_t gfp_mask)
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+		    u64 *old_aux, gfp_t gfp_mask)
 {
 	int i;
 
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 21bdc8ec8130..21a1963439c3 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -33,7 +33,7 @@ struct ulist_iterator {
  */
 struct ulist_node {
 	u64 val;		/* value to store */
-	unsigned long aux;	/* auxiliary value saved along with the val */
+	u64 aux;		/* auxiliary value saved along with the val */
 };
 
 struct ulist {
@@ -65,10 +65,9 @@ void ulist_fini(struct ulist *ulist);
 void ulist_reinit(struct ulist *ulist);
 struct ulist *ulist_alloc(gfp_t gfp_mask);
 void ulist_free(struct ulist *ulist);
-int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-	      gfp_t gfp_mask);
-int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
-		    unsigned long *old_aux, gfp_t gfp_mask);
+int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
+		    u64 *old_aux, gfp_t gfp_mask);
 struct ulist_node *ulist_next(struct ulist *ulist,
 			      struct ulist_iterator *uiter);
 

From e479d9bb5f8366a064915b3c9ac47ed6e9fcc7d3 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 16:09:35 +0200
Subject: [PATCH 014/121] Btrfs: update send_progress at correct places

Updating send_progress in process_recorded_refs was not correct.
It got updated too early in the cur_inode_new_gen case.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 02e901adc3e8..f618224d2326 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2886,12 +2886,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		}
 	}
 
-	/*
-	 * Current inode is now at it's new position, so we must increase
-	 * send_progress
-	 */
-	sctx->send_progress = sctx->cur_ino + 1;
-
 	ret = 0;
 
 out:
@@ -3896,6 +3890,15 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
 		goto out;
 
 	ret = process_recorded_refs(sctx);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * We have processed the refs and thus need to advance send_progress.
+	 * Now, calls to get_cur_xxx will take the updated refs of the current
+	 * inode into account.
+	 */
+	sctx->send_progress = sctx->cur_ino + 1;
 
 out:
 	return ret;
@@ -3993,6 +3996,12 @@ static int changed_inode(struct send_ctx *sctx,
 
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
+
+	/*
+	 * Set send_progress to current inode. This will tell all get_cur_xxx
+	 * functions that the current inode's refs are not updated yet. Later,
+	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
+	 */
 	sctx->send_progress = sctx->cur_ino;
 
 	if (result == BTRFS_COMPARE_TREE_NEW ||
@@ -4066,6 +4075,11 @@ static int changed_inode(struct send_ctx *sctx,
 			ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
 			if (ret < 0)
 				goto out;
+			/*
+			 * Advance send_progress now as we did not get into
+			 * process_recorded_refs_if_needed in the new_gen case.
+			 */
+			sctx->send_progress = sctx->cur_ino + 1;
 			ret = process_all_extents(sctx);
 			if (ret < 0)
 				goto out;

From 766702ef49b8b5299d819f3a0ac42613c23424d1 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 14:11:31 +0200
Subject: [PATCH 015/121] Btrfs: add/fix comments/documentation for
 send/receive

As the subject already said, add/fix comments.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 140 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 134 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f618224d2326..6926546939f6 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1067,6 +1067,7 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
 
 /*
  * Called for every backref that is found for the current extent.
+ * Results are collected in sctx->clone_roots->ino/offset/found_refs
  */
 static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 {
@@ -1090,7 +1091,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	}
 
 	/*
-	 * There are inodes that have extents that lie behind it's i_size. Don't
+	 * There are inodes that have extents that lie behind its i_size. Don't
 	 * accept clones from these extents.
 	 */
 	ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
@@ -1137,6 +1138,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 }
 
 /*
+ * Given an inode, offset and extent item, it finds a good clone for a clone
+ * instruction. Returns -ENOENT when none could be found. The function makes
+ * sure that the returned clone is usable at the point where sending is at the
+ * moment. This means, that no clones are accepted which lie behind the current
+ * inode+offset.
+ *
  * path must point to the extent item when called.
  */
 static int find_extent_clone(struct send_ctx *sctx,
@@ -1529,6 +1536,10 @@ out:
 	return ret;
 }
 
+/*
+ * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
+ * generation of the parent dir and the name of the dir entry.
+ */
 static int get_first_ref(struct send_ctx *sctx,
 			 struct btrfs_root *root, u64 ino,
 			 u64 *dir, u64 *dir_gen, struct fs_path *name)
@@ -1615,6 +1626,16 @@ out:
 	return ret;
 }
 
+/*
+ * Used by process_recorded_refs to determine if a new ref would overwrite an
+ * already existing ref. In case it detects an overwrite, it returns the
+ * inode/gen in who_ino/who_gen.
+ * When an overwrite is detected, process_recorded_refs does proper orphanizing
+ * to make sure later references to the overwritten inode are possible.
+ * Orphanizing is however only required for the first ref of an inode.
+ * process_recorded_refs does an additional is_first_ref check to see if
+ * orphanizing is really required.
+ */
 static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 			      const char *name, int name_len,
 			      u64 *who_ino, u64 *who_gen)
@@ -1639,6 +1660,11 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 		goto out;
 	}
 
+	/*
+	 * Check if the overwritten ref was already processed. If yes, the ref
+	 * was already unlinked/moved, so we can safely assume that we will not
+	 * overwrite anything at this point in time.
+	 */
 	if (other_inode > sctx->send_progress) {
 		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
 				who_gen, NULL, NULL, NULL, NULL);
@@ -1655,6 +1681,13 @@ out:
 	return ret;
 }
 
+/*
+ * Checks if the ref was overwritten by an already processed inode. This is
+ * used by __get_cur_name_and_parent to find out if the ref was orphanized and
+ * thus the orphan name needs be used.
+ * process_recorded_refs also uses it to avoid unlinking of refs that were
+ * overwritten.
+ */
 static int did_overwrite_ref(struct send_ctx *sctx,
 			    u64 dir, u64 dir_gen,
 			    u64 ino, u64 ino_gen,
@@ -1703,6 +1736,11 @@ out:
 	return ret;
 }
 
+/*
+ * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
+ * that got overwritten. This is used by process_recorded_refs to determine
+ * if it has to use the path as returned by get_cur_path or the orphan name.
+ */
 static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 {
 	int ret = 0;
@@ -1731,6 +1769,11 @@ out:
 	return ret;
 }
 
+/*
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
+ * so we need to do some special handling in case we have clashes. This function
+ * takes care of this with the help of name_cache_entry::radix_list.
+ */
 static int name_cache_insert(struct send_ctx *sctx,
 			     struct name_cache_entry *nce)
 {
@@ -1792,12 +1835,19 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
 	return NULL;
 }
 
+/*
+ * Removes the entry from the list and adds it back to the end. This marks the
+ * entry as recently used so that name_cache_clean_unused does not remove it.
+ */
 static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
 {
 	list_del(&nce->list);
 	list_add_tail(&nce->list, &sctx->name_cache_list);
 }
 
+/*
+ * Remove some entries from the beginning of name_cache_list.
+ */
 static void name_cache_clean_unused(struct send_ctx *sctx)
 {
 	struct name_cache_entry *nce;
@@ -1824,6 +1874,14 @@ static void name_cache_free(struct send_ctx *sctx)
 	}
 }
 
+/*
+ * Used by get_cur_path for each ref up to the root.
+ * Returns 0 if it succeeded.
+ * Returns 1 if the inode is not existent or got overwritten. In that case, the
+ * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
+ * is returned, parent_ino/parent_gen are not guaranteed to be valid.
+ * Returns <0 in case of error.
+ */
 static int __get_cur_name_and_parent(struct send_ctx *sctx,
 				     u64 ino, u64 gen,
 				     u64 *parent_ino,
@@ -1835,6 +1893,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	struct btrfs_path *path = NULL;
 	struct name_cache_entry *nce = NULL;
 
+	/*
+	 * First check if we already did a call to this function with the same
+	 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
+	 * return the cached result.
+	 */
 	nce = name_cache_search(sctx, ino, gen);
 	if (nce) {
 		if (ino < sctx->send_progress && nce->need_later_update) {
@@ -1857,6 +1920,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	if (!path)
 		return -ENOMEM;
 
+	/*
+	 * If the inode is not existent yet, add the orphan name and return 1.
+	 * This should only happen for the parent dir that we determine in
+	 * __record_new_ref
+	 */
 	ret = is_inode_existent(sctx, ino, gen);
 	if (ret < 0)
 		goto out;
@@ -1869,6 +1937,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 		goto out_cache;
 	}
 
+	/*
+	 * Depending on whether the inode was already processed or not, use
+	 * send_root or parent_root for ref lookup.
+	 */
 	if (ino < sctx->send_progress)
 		ret = get_first_ref(sctx, sctx->send_root, ino,
 				parent_ino, parent_gen, dest);
@@ -1878,6 +1950,10 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * Check if the ref was overwritten by an inode's ref that was processed
+	 * earlier. If yes, treat as orphan and return 1.
+	 */
 	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
 			dest->start, dest->end - dest->start);
 	if (ret < 0)
@@ -1891,6 +1967,9 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
 	}
 
 out_cache:
+	/*
+	 * Store the result of the lookup in the name cache.
+	 */
 	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
 	if (!nce) {
 		ret = -ENOMEM;
@@ -2278,7 +2357,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
 			btrfs_inode_mtime(ii));
 	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
 			btrfs_inode_ctime(ii));
-	/* TODO otime? */
+	/* TODO Add otime support when the otime patches get into upstream */
 
 	ret = send_cmd(sctx);
 
@@ -2520,7 +2599,7 @@ static void free_recorded_refs(struct send_ctx *sctx)
 }
 
 /*
- * Renames/moves a file/dir to it's orphan name. Used when the first
+ * Renames/moves a file/dir to its orphan name. Used when the first
  * ref of an unprocessed inode gets overwritten and for all non empty
  * directories.
  */
@@ -2840,7 +2919,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 * If the inode is still orphan, unlink the orphan. This may
 		 * happen when a previous inode did overwrite the first ref
 		 * of this inode and no new refs were added for the current
-		 * inode.
+		 * inode. Unlinking does not mean that the inode is deleted in
+		 * all cases. There may still be links to this inode in other
+		 * places.
 		 */
 		if (is_orphan) {
 			ret = send_unlink(sctx, valid_path);
@@ -2857,6 +2938,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 */
 	ULIST_ITER_INIT(&uit);
 	while ((un = ulist_next(check_dirs, &uit))) {
+		/*
+		 * In case we had refs into dirs that were not processed yet,
+		 * we don't need to do the utime and rmdir logic for these dirs.
+		 * The dir will be processed later.
+		 */
 		if (un->val > sctx->cur_ino)
 			continue;
 
@@ -4048,7 +4134,17 @@ static int changed_inode(struct send_ctx *sctx,
 		sctx->cur_inode_mode = btrfs_inode_mode(
 				sctx->right_path->nodes[0], right_ii);
 	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
+		/*
+		 * We need to do some special handling in case the inode was
+		 * reported as changed with a changed generation number. This
+		 * means that the original inode was deleted and new inode
+		 * reused the same inum. So we have to treat the old inode as
+		 * deleted and the new one as new.
+		 */
 		if (sctx->cur_inode_new_gen) {
+			/*
+			 * First, process the inode as if it was deleted.
+			 */
 			sctx->cur_inode_gen = right_gen;
 			sctx->cur_inode_new = 0;
 			sctx->cur_inode_deleted = 1;
@@ -4061,6 +4157,9 @@ static int changed_inode(struct send_ctx *sctx,
 			if (ret < 0)
 				goto out;
 
+			/*
+			 * Now process the inode as if it was new.
+			 */
 			sctx->cur_inode_gen = left_gen;
 			sctx->cur_inode_new = 1;
 			sctx->cur_inode_deleted = 0;
@@ -4080,6 +4179,11 @@ static int changed_inode(struct send_ctx *sctx,
 			 * process_recorded_refs_if_needed in the new_gen case.
 			 */
 			sctx->send_progress = sctx->cur_ino + 1;
+
+			/*
+			 * Now process all extents and xattrs of the inode as if
+			 * they were all new.
+			 */
 			ret = process_all_extents(sctx);
 			if (ret < 0)
 				goto out;
@@ -4102,6 +4206,16 @@ out:
 	return ret;
 }
 
+/*
+ * We have to process new refs before deleted refs, but compare_trees gives us
+ * the new and deleted refs mixed. To fix this, we record the new/deleted refs
+ * first and later process them in process_recorded_refs.
+ * For the cur_inode_new_gen case, we skip recording completely because
+ * changed_inode did already initiate processing of refs. The reason for this is
+ * that in this case, compare_tree actually compares the refs of 2 different
+ * inodes. To fix this, process_all_refs is used in changed_inode to handle all
+ * refs of the right tree as deleted and all refs of the left tree as new.
+ */
 static int changed_ref(struct send_ctx *sctx,
 		       enum btrfs_compare_tree_result result)
 {
@@ -4122,6 +4236,11 @@ static int changed_ref(struct send_ctx *sctx,
 	return ret;
 }
 
+/*
+ * Process new/deleted/changed xattrs. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of xattrs. The reason is the same as in changed_ref
+ */
 static int changed_xattr(struct send_ctx *sctx,
 			 enum btrfs_compare_tree_result result)
 {
@@ -4141,6 +4260,11 @@ static int changed_xattr(struct send_ctx *sctx,
 	return ret;
 }
 
+/*
+ * Process new/deleted/changed extents. We skip processing in the
+ * cur_inode_new_gen case because changed_inode did already initiate processing
+ * of extents. The reason is the same as in changed_ref
+ */
 static int changed_extent(struct send_ctx *sctx,
 			  enum btrfs_compare_tree_result result)
 {
@@ -4157,7 +4281,10 @@ static int changed_extent(struct send_ctx *sctx,
 	return ret;
 }
 
-
+/*
+ * Updates compare related fields in sctx and simply forwards to the actual
+ * changed_xxx functions.
+ */
 static int changed_cb(struct btrfs_root *left_root,
 		      struct btrfs_root *right_root,
 		      struct btrfs_path *left_path,
@@ -4229,7 +4356,8 @@ join_trans:
 	}
 
 	/*
-	 * Make sure the tree has not changed
+	 * Make sure the tree has not changed after re-joining. We detect this
+	 * by comparing start_ctransid and ctransid. They should always match.
 	 */
 	spin_lock(&send_root->root_times_lock);
 	ctransid = btrfs_root_ctransid(&send_root->root_item);

From e938c8ad543cd5ffe344371747484303ad51dfba Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Sat, 28 Jul 2012 16:33:49 +0200
Subject: [PATCH 016/121] Btrfs: code cleanups for send/receive

Doing some code cleanups as suggested by Arne.
Changes do not change any logic.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 83 +++++++++++++++++++++----------------------------
 1 file changed, 35 insertions(+), 48 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6926546939f6..3bc921aa5e21 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1115,10 +1115,12 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 		 */
 		if (ino >= bctx->cur_objectid)
 			return 0;
-		/*if (ino > ctx->cur_objectid)
+#if 0
+		if (ino > bctx->cur_objectid)
 			return 0;
-		if (offset + ctx->extent_len > ctx->cur_offset)
-			return 0;*/
+		if (offset + bctx->extent_len > bctx->cur_offset)
+			return 0;
+#endif
 	}
 
 	bctx->found++;
@@ -1327,8 +1329,6 @@ static int read_symlink(struct send_ctx *sctx,
 	len = btrfs_file_extent_inline_len(path->nodes[0], ei);
 
 	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
-	if (ret < 0)
-		goto out;
 
 out:
 	btrfs_free_path(path);
@@ -1440,9 +1440,9 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
 	}
 
 	if (!left_ret && !right_ret) {
-		if (left_gen == gen && right_gen == gen)
+		if (left_gen == gen && right_gen == gen) {
 			ret = inode_state_no_change;
-		else if (left_gen == gen) {
+		} else if (left_gen == gen) {
 			if (ino < sctx->send_progress)
 				ret = inode_state_did_create;
 			else
@@ -1615,11 +1615,7 @@ static int is_first_ref(struct send_ctx *sctx,
 		goto out;
 	}
 
-	ret = memcmp(tmp_name->start, name, name_len);
-	if (ret)
-		ret = 0;
-	else
-		ret = 1;
+	ret = !memcmp(tmp_name->start, name, name_len);
 
 out:
 	fs_path_free(sctx, tmp_name);
@@ -1761,8 +1757,6 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
 
 	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
 			name->start, fs_path_len(name));
-	if (ret < 0)
-		goto out;
 
 out:
 	fs_path_free(sctx, name);
@@ -1866,9 +1860,10 @@ static void name_cache_clean_unused(struct send_ctx *sctx)
 static void name_cache_free(struct send_ctx *sctx)
 {
 	struct name_cache_entry *nce;
-	struct name_cache_entry *tmp;
 
-	list_for_each_entry_safe(nce, tmp, &sctx->name_cache_list, list) {
+	while (!list_empty(&sctx->name_cache_list)) {
+		nce = list_entry(sctx->name_cache_list.next,
+				struct name_cache_entry, list);
 		name_cache_delete(sctx, nce);
 		kfree(nce);
 	}
@@ -2188,9 +2183,6 @@ static int send_subvol_begin(struct send_ctx *sctx)
 	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
 	btrfs_release_path(path);
 
-	if (ret < 0)
-		goto out;
-
 	if (parent_root) {
 		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
 		if (ret < 0)
@@ -2393,19 +2385,19 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
 	if (ret < 0)
 		goto out;
 
-	if (S_ISREG(mode))
+	if (S_ISREG(mode)) {
 		cmd = BTRFS_SEND_C_MKFILE;
-	else if (S_ISDIR(mode))
+	} else if (S_ISDIR(mode)) {
 		cmd = BTRFS_SEND_C_MKDIR;
-	else if (S_ISLNK(mode))
+	} else if (S_ISLNK(mode)) {
 		cmd = BTRFS_SEND_C_SYMLINK;
-	else if (S_ISCHR(mode) || S_ISBLK(mode))
+	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
 		cmd = BTRFS_SEND_C_MKNOD;
-	else if (S_ISFIFO(mode))
+	} else if (S_ISFIFO(mode)) {
 		cmd = BTRFS_SEND_C_MKFIFO;
-	else if (S_ISSOCK(mode))
+	} else if (S_ISSOCK(mode)) {
 		cmd = BTRFS_SEND_C_MKSOCK;
-	else {
+	} else {
 		printk(KERN_WARNING "btrfs: unexpected inode type %o",
 				(int)(mode & S_IFMT));
 		ret = -ENOTSUPP;
@@ -2583,13 +2575,13 @@ static int record_ref(struct list_head *head, u64 dir,
 static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
 {
 	struct recorded_ref *cur;
-	struct recorded_ref *tmp;
 
-	list_for_each_entry_safe(cur, tmp, head, list) {
+	while (!list_empty(head)) {
+		cur = list_entry(head->next, struct recorded_ref, list);
 		fs_path_free(sctx, cur->full_path);
+		list_del(&cur->list);
 		kfree(cur);
 	}
-	INIT_LIST_HEAD(head);
 }
 
 static void free_recorded_refs(struct send_ctx *sctx)
@@ -3205,24 +3197,18 @@ static int process_all_refs(struct send_ctx *sctx,
 	key.offset = 0;
 	while (1) {
 		ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
-		if (ret < 0) {
-			btrfs_release_path(path);
+		if (ret < 0)
 			goto out;
-		}
-		if (ret) {
-			btrfs_release_path(path);
+		if (ret)
 			break;
-		}
 
 		eb = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(eb, &found_key, slot);
 
 		if (found_key.objectid != key.objectid ||
-		    found_key.type != key.type) {
-			btrfs_release_path(path);
+		    found_key.type != key.type)
 			break;
-		}
 
 		ret = iterate_inode_ref(sctx, sctx->parent_root, path,
 				&found_key, 0, cb, sctx);
@@ -3232,6 +3218,7 @@ static int process_all_refs(struct send_ctx *sctx,
 
 		key.offset = found_key.offset + 1;
 	}
+	btrfs_release_path(path);
 
 	ret = process_recorded_refs(sctx);
 
@@ -3554,7 +3541,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
 	int ret = 0;
 	struct fs_path *p;
 	loff_t pos = offset;
-	int readed = 0;
+	int num_read = 0;
 	mm_segment_t old_fs;
 
 	p = fs_path_alloc(sctx);
@@ -3579,8 +3566,8 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
 	if (ret < 0)
 		goto out;
-	readed = ret;
-	if (!readed)
+	num_read = ret;
+	if (!num_read)
 		goto out;
 
 	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
@@ -3593,7 +3580,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
-	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, readed);
+	TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
 
 	ret = send_cmd(sctx);
 
@@ -3603,7 +3590,7 @@ out:
 	set_fs(old_fs);
 	if (ret < 0)
 		return ret;
-	return readed;
+	return num_read;
 }
 
 /*
@@ -3614,7 +3601,6 @@ static int send_clone(struct send_ctx *sctx,
 		      struct clone_root *clone_root)
 {
 	int ret = 0;
-	struct btrfs_root *clone_root2 = clone_root->root;
 	struct fs_path *p;
 	u64 gen;
 
@@ -3639,22 +3625,23 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
 
-	if (clone_root2 == sctx->send_root) {
+	if (clone_root->root == sctx->send_root) {
 		ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
 				&gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
 			goto out;
 		ret = get_cur_path(sctx, clone_root->ino, gen, p);
 	} else {
-		ret = get_inode_path(sctx, clone_root2, clone_root->ino, p);
+		ret = get_inode_path(sctx, clone_root->root,
+				clone_root->ino, p);
 	}
 	if (ret < 0)
 		goto out;
 
 	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-			clone_root2->root_item.uuid);
+			clone_root->root->root_item.uuid);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
-			clone_root2->root_item.ctransid);
+			clone_root->root->root_item.ctransid);
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
 			clone_root->offset);

From 3e126f32f88095ad1bd01b3c451e52aa9094f45c Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 12:03:09 +0200
Subject: [PATCH 017/121] Btrfs: remove unused tmp_path from iterate_dir_item

A leftover from older code and unused now.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3bc921aa5e21..b0f9df30f24d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -864,7 +864,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
 	struct extent_buffer *eb;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
-	struct btrfs_path *tmp_path = NULL;
 	struct btrfs_key di_key;
 	char *buf = NULL;
 	char *buf2 = NULL;
@@ -886,12 +885,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
 		goto out;
 	}
 
-	tmp_path = alloc_path_for_send();
-	if (!tmp_path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	eb = path->nodes[0];
 	slot = path->slots[0];
 	item = btrfs_item_nr(eb, slot);
@@ -953,7 +946,6 @@ static int iterate_dir_item(struct send_ctx *sctx,
 	}
 
 out:
-	btrfs_free_path(tmp_path);
 	if (buf_virtual)
 		vfree(buf);
 	else

From 5dc67d0ba915cda01ed3a980502945edf2c46b70 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 12:07:43 +0200
Subject: [PATCH 018/121] Btrfs: free nce and nce_head on error in
 name_cache_insert

Both were leaked in case of error.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b0f9df30f24d..cfbe987f854b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1759,6 +1759,7 @@ out:
  * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
  * so we need to do some special handling in case we have clashes. This function
  * takes care of this with the help of name_cache_entry::radix_list.
+ * In case of error, nce is kfreed.
  */
 static int name_cache_insert(struct send_ctx *sctx,
 			     struct name_cache_entry *nce)
@@ -1775,8 +1776,11 @@ static int name_cache_insert(struct send_ctx *sctx,
 		INIT_LIST_HEAD(nce_head);
 
 		ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
-		if (ret < 0)
+		if (ret < 0) {
+			kfree(nce_head);
+			kfree(nce);
 			return ret;
+		}
 	}
 	list_add_tail(&nce->radix_list, nce_head);
 	list_add_tail(&nce->list, &sctx->name_cache_list);

From 3954096d4b1adab6fbb3c26de37ddd4f781e072f Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 12:46:05 +0200
Subject: [PATCH 019/121] Btrfs: fix check for changed extent in
 is_extent_unchanged

The previous check was working fine, but this check should be
easier to read. Also, we could theoritically have some exotic
bugs with the previous checks.

Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index cfbe987f854b..35222d5420f6 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3819,8 +3819,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		/*
 		 * Check if we have the same extent.
 		 */
-		if (left_disknr + left_offset_fixed !=
-				right_disknr + right_offset) {
+		if (left_disknr != right_disknr ||
+		    left_offset_fixed != right_offset) {
 			ret = 0;
 			goto out;
 		}

From d8347fa444c682dc8a1e24e7158bfa2dd4c9ca71 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 12:49:15 +0200
Subject: [PATCH 020/121] Btrfs: use <= instead of < in is_extent_unchanged

Used the wrong compare operator here.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 35222d5420f6..c6c10a43153c 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3802,7 +3802,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		 * Are we at extent 8? If yes, we know the extent is changed.
 		 * This may only happen on the first iteration.
 		 */
-		if (found_key.offset + right_len < ekey->offset) {
+		if (found_key.offset + right_len <= ekey->offset) {
 			ret = 0;
 			goto out;
 		}

From 2f28f4787c0fc37c508cfb6b7b11c00ce5072940 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 14:42:14 +0200
Subject: [PATCH 021/121] Btrfs: pass root instead of parent_root to
 iterate_inode_ref

We need to pass the root that we determined earlier to iterate_inode_ref.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c6c10a43153c..db051d17b0be 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3206,8 +3206,8 @@ static int process_all_refs(struct send_ctx *sctx,
 		    found_key.type != key.type)
 			break;
 
-		ret = iterate_inode_ref(sctx, sctx->parent_root, path,
-				&found_key, 0, cb, sctx);
+		ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb,
+				sctx);
 		btrfs_release_path(path);
 		if (ret < 0)
 			goto out;

From 2981e225f7048b36470383bf5622c42139bd96f7 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 14:47:03 +0200
Subject: [PATCH 022/121] Btrfs: ignore non-FS inodes for send/receive

We have to ignore inode/space cache objects in send/receive.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index db051d17b0be..a4011a9148fd 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4287,6 +4287,11 @@ static int changed_cb(struct btrfs_root *left_root,
 	if (ret < 0)
 		goto out;
 
+	/* Ignore non-FS objects */
+	if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
+	    key->objectid == BTRFS_FREE_SPACE_OBJECTID)
+		goto out;
+
 	if (key->type == BTRFS_INODE_ITEM_KEY)
 		ret = changed_inode(sctx, result);
 	else if (key->type == BTRFS_INODE_REF_KEY)

From 6d85ed05e16e7ff747025c8374f23d7d81c98540 Mon Sep 17 00:00:00 2001
From: Alexander Block <ablock84@googlemail.com>
Date: Wed, 1 Aug 2012 14:48:59 +0200
Subject: [PATCH 023/121] Btrfs: don't treat top/root directory inode as
 deleted/reused

We can't do the deleted/reused logic for top/root inodes as it would
create a stream that tries to delete and recreate the root dir.

Reported-by: Alex Lyakas <alex.bolshoy.btrfs@gmail.com>
Signed-off-by: Alexander Block <ablock84@googlemail.com>
---
 fs/btrfs/send.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a4011a9148fd..d17d75ebc482 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2627,6 +2627,12 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
 	struct btrfs_key loc;
 	struct btrfs_dir_item *di;
 
+	/*
+	 * Don't try to rmdir the top/root subvolume dir.
+	 */
+	if (dir == BTRFS_FIRST_FREE_OBJECTID)
+		return 0;
+
 	path = alloc_path_for_send();
 	if (!path)
 		return -ENOMEM;
@@ -2687,6 +2693,12 @@ static int process_recorded_refs(struct send_ctx *sctx)
 
 verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 
+	/*
+	 * This should never happen as the root dir always has the same ref
+	 * which is always '..'
+	 */
+	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+
 	valid_path = fs_path_alloc(sctx);
 	if (!valid_path) {
 		ret = -ENOMEM;
@@ -4094,7 +4106,14 @@ static int changed_inode(struct send_ctx *sctx,
 
 		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
 				right_ii);
-		if (left_gen != right_gen)
+
+		/*
+		 * The cur_ino = root dir case is special here. We can't treat
+		 * the inode as deleted+reused because it would generate a
+		 * stream that tries to delete/mkdir the root dir.
+		 */
+		if (left_gen != right_gen &&
+		    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
 			sctx->cur_inode_new_gen = 1;
 	}
 

From 74dd17fbe3d65829e75d84f00a9525b2ace93998 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@fusionio.com>
Date: Tue, 7 Aug 2012 16:25:13 -0400
Subject: [PATCH 024/121] Btrfs: fix btrfs send for inline items and
 compression

The btrfs send code was assuming the offset of the file item into the
extent translated to bytes on disk.  If we're compressed, this isn't
true, and so it was off into extents owned by other files.

It was also improperly handling inline extents.  This solves a crash
where we may have gone past the end of the file extent item by not
testing early enough for an inline extent.  It also solves problems
where we have a whole between the end of the inline item and the start
of the full extent.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c     |  5 ++++-
 fs/btrfs/extent_io.c |  1 -
 fs/btrfs/send.c      | 46 +++++++++++++++++++++++++++++++-------------
 3 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d183f60d63a..fcc8c21a6233 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5073,6 +5073,7 @@ static void tree_move_down(struct btrfs_root *root,
 			   struct btrfs_path *path,
 			   int *level, int root_level)
 {
+	BUG_ON(*level == 0);
 	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
 					path->slots[*level]);
 	path->slots[*level - 1] = 0;
@@ -5089,7 +5090,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root,
 
 	path->slots[*level]++;
 
-	while (path->slots[*level] == nritems) {
+	while (path->slots[*level] >= nritems) {
 		if (*level == root_level)
 			return -1;
 
@@ -5433,9 +5434,11 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 					goto out;
 				advance_right = ADVANCE;
 			} else {
+				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
 				ret = tree_compare_item(left_root, left_path,
 						right_path, tmp_buf);
 				if (ret) {
+					WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
 					ret = changed_cb(left_root, right_root,
 						left_path, right_path,
 						&left_key,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4c878476bb91..19319f5a91a8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4332,7 +4332,6 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
 
 		/* Should be safe to release our pages at this point */
 		btrfs_release_extent_buffer_page(eb, 0);
-
 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 		return 1;
 	}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d17d75ebc482..f22fdd41ff49 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1149,6 +1149,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	int ret;
 	int extent_type;
 	u64 logical;
+	u64 disk_byte;
 	u64 num_bytes;
 	u64 extent_item_pos;
 	struct btrfs_file_extent_item *fi;
@@ -1157,6 +1158,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	struct clone_root *cur_clone_root;
 	struct btrfs_key found_key;
 	struct btrfs_path *tmp_path;
+	int compressed;
 	u32 i;
 
 	tmp_path = alloc_path_for_send();
@@ -1186,17 +1188,18 @@ static int find_extent_clone(struct send_ctx *sctx,
 		ret = -ENOENT;
 		goto out;
 	}
+	compressed = btrfs_file_extent_compression(eb, fi);
 
 	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
-	logical = btrfs_file_extent_disk_bytenr(eb, fi);
-	if (logical == 0) {
+	disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+	if (disk_byte == 0) {
 		ret = -ENOENT;
 		goto out;
 	}
-	logical += btrfs_file_extent_offset(eb, fi);
+	logical = disk_byte + btrfs_file_extent_offset(eb, fi);
 
 	ret = extent_from_logical(sctx->send_root->fs_info,
-			logical, tmp_path, &found_key);
+			disk_byte, tmp_path, &found_key);
 	btrfs_release_path(tmp_path);
 
 	if (ret < 0)
@@ -1234,10 +1237,16 @@ static int find_extent_clone(struct send_ctx *sctx,
 	/*
 	 * Now collect all backrefs.
 	 */
+	if (compressed == BTRFS_COMPRESS_NONE)
+		extent_item_pos = logical - found_key.objectid;
+	else
+		extent_item_pos = 0;
+
 	extent_item_pos = logical - found_key.objectid;
 	ret = iterate_extent_inodes(sctx->send_root->fs_info,
 					found_key.objectid, extent_item_pos, 1,
 					__iterate_backrefs, backref_ctx);
+
 	if (ret < 0)
 		goto out;
 
@@ -1246,8 +1255,8 @@ static int find_extent_clone(struct send_ctx *sctx,
 		ret = -EIO;
 		printk(KERN_ERR "btrfs: ERROR did not find backref in "
 				"send_root. inode=%llu, offset=%llu, "
-				"logical=%llu\n",
-				ino, data_offset, logical);
+				"disk_byte=%llu found extent=%llu\n",
+				ino, data_offset, disk_byte, found_key.objectid);
 		goto out;
 	}
 
@@ -3678,10 +3687,17 @@ static int send_write_or_clone(struct send_ctx *sctx,
 	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			struct btrfs_file_extent_item);
 	type = btrfs_file_extent_type(path->nodes[0], ei);
-	if (type == BTRFS_FILE_EXTENT_INLINE)
+	if (type == BTRFS_FILE_EXTENT_INLINE) {
 		len = btrfs_file_extent_inline_len(path->nodes[0], ei);
-	else
+		/*
+		 * it is possible the inline item won't cover the whole page,
+		 * but there may be items after this page.  Make
+		 * sure to send the whole thing
+		 */
+		len = PAGE_CACHE_ALIGN(len);
+	} else {
 		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+	}
 
 	if (offset + len > sctx->cur_inode_size)
 		len = sctx->cur_inode_size - offset;
@@ -3729,6 +3745,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	u64 left_offset_fixed;
 	u64 left_len;
 	u64 right_len;
+	u64 left_gen;
+	u64 right_gen;
 	u8 left_type;
 	u8 right_type;
 
@@ -3738,17 +3756,17 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 
 	eb = left_path->nodes[0];
 	slot = left_path->slots[0];
-
 	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 	left_type = btrfs_file_extent_type(eb, ei);
-	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-	left_len = btrfs_file_extent_num_bytes(eb, ei);
-	left_offset = btrfs_file_extent_offset(eb, ei);
 
 	if (left_type != BTRFS_FILE_EXTENT_REG) {
 		ret = 0;
 		goto out;
 	}
+	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+	left_len = btrfs_file_extent_num_bytes(eb, ei);
+	left_offset = btrfs_file_extent_offset(eb, ei);
+	left_gen = btrfs_file_extent_generation(eb, ei);
 
 	/*
 	 * Following comments will refer to these graphics. L is the left
@@ -3804,6 +3822,7 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
 		right_len = btrfs_file_extent_num_bytes(eb, ei);
 		right_offset = btrfs_file_extent_offset(eb, ei);
+		right_gen = btrfs_file_extent_generation(eb, ei);
 
 		if (right_type != BTRFS_FILE_EXTENT_REG) {
 			ret = 0;
@@ -3832,7 +3851,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		 * Check if we have the same extent.
 		 */
 		if (left_disknr != right_disknr ||
-		    left_offset_fixed != right_offset) {
+		    left_offset_fixed != right_offset ||
+		    left_gen != right_gen) {
 			ret = 0;
 			goto out;
 		}

From 995e01b7af745b8aaa5e882cfb7bfd5baab3f335 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Aug 2012 02:52:38 -0600
Subject: [PATCH 025/121] Btrfs: fix gcc warnings for 32bit compiles

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/backref.c   | 25 +++++++++++++------------
 fs/btrfs/file-item.c |  2 +-
 fs/btrfs/qgroup.c    | 32 ++++++++++++++++----------------
 fs/btrfs/send.c      |  4 ++--
 4 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6655ca615364..e600857d3ca4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -231,7 +231,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			}
 			if (!ret) {
 				ret = ulist_add(parents, eb->start,
-						(u64)eie, GFP_NOFS);
+						(uintptr_t)eie, GFP_NOFS);
 				if (ret < 0)
 					break;
 				if (!extent_item_pos) {
@@ -363,8 +363,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&uiter);
 		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
-		ref->inode_list =
-			node ? (struct extent_inode_elem *)node->aux : 0;
+		ref->inode_list = node ?
+			(struct extent_inode_elem *)(uintptr_t)node->aux : 0;
 
 		/* additional parents require new refs being added here */
 		while ((node = ulist_next(parents, &uiter))) {
@@ -375,8 +375,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			}
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
-			new_ref->inode_list =
-					(struct extent_inode_elem *)node->aux;
+			new_ref->inode_list = (struct extent_inode_elem *)
+							(uintptr_t)node->aux;
 			list_add(&new_ref->list, &ref->list);
 		}
 		ulist_reinit(parents);
@@ -914,7 +914,7 @@ again:
 				free_extent_buffer(eb);
 			}
 			ret = ulist_add_merge(refs, ref->parent,
-					      (u64)ref->inode_list,
+					      (uintptr_t)ref->inode_list,
 					      (u64 *)&eie, GFP_NOFS);
 			if (!ret && extent_item_pos) {
 				/*
@@ -959,7 +959,7 @@ static void free_leaf_list(struct ulist *blocks)
 	while ((node = ulist_next(blocks, &uiter))) {
 		if (!node->aux)
 			continue;
-		eie = (struct extent_inode_elem *)node->aux;
+		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
 		for (; eie; eie = eie_next) {
 			eie_next = eie->next;
 			kfree(eie);
@@ -1405,11 +1405,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
 				 "%#llx\n", root_node->val, ref_node->val,
-				 ref_node->aux);
-			ret = iterate_leaf_refs(
-				(struct extent_inode_elem *)ref_node->aux,
-				root_node->val, extent_item_objectid,
-				iterate, ctx);
+				 (long long)ref_node->aux);
+			ret = iterate_leaf_refs((struct extent_inode_elem *)
+						(uintptr_t)ref_node->aux,
+						root_node->val,
+						extent_item_objectid,
+						iterate, ctx);
 		}
 		ulist_free(roots);
 		roots = NULL;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 857d93cd01dc..54f03354bed1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,7 +25,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-#define __MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 24f6adcdf339..9b707ba5c6c4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1145,12 +1145,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 		ulist_reinit(tmp);
 						/* XXX id not needed */
-		ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC);
+		ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC);
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
 
-			qg = (struct btrfs_qgroup *)tmp_unode->aux;
+			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
 			if (qg->refcnt < seq)
 				qg->refcnt = seq + 1;
 			else
@@ -1158,7 +1158,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ulist_add(tmp, glist->group->qgroupid,
-					  (u64)glist->group,
+					  (u64)(uintptr_t)glist->group,
 					  GFP_ATOMIC);
 			}
 		}
@@ -1168,13 +1168,13 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 	 * step 2: walk from the new root
 	 */
 	ulist_reinit(tmp);
-	ulist_add(tmp, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
+	ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(tmp, &uiter))) {
 		struct btrfs_qgroup *qg;
 		struct btrfs_qgroup_list *glist;
 
-		qg = (struct btrfs_qgroup *)unode->aux;
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 		if (qg->refcnt < seq) {
 			/* not visited by step 1 */
 			qg->rfer += sgn * node->num_bytes;
@@ -1190,7 +1190,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(tmp, glist->group->qgroupid,
-				  (u64)glist->group, GFP_ATOMIC);
+				  (uintptr_t)glist->group, GFP_ATOMIC);
 		}
 	}
 
@@ -1208,12 +1208,12 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 			continue;
 
 		ulist_reinit(tmp);
-		ulist_add(tmp, qg->qgroupid, (u64)qg, GFP_ATOMIC);
+		ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
 
-			qg = (struct btrfs_qgroup *)tmp_unode->aux;
+			qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
 			if (qg->tag == seq)
 				continue;
 
@@ -1225,7 +1225,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
 
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ulist_add(tmp, glist->group->qgroupid,
-					  (u64)glist->group,
+					  (uintptr_t)glist->group,
 					  GFP_ATOMIC);
 			}
 		}
@@ -1469,13 +1469,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	 * be exceeded
 	 */
 	ulist = ulist_alloc(GFP_ATOMIC);
-	ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 		struct btrfs_qgroup_list *glist;
 
-		qg = (struct btrfs_qgroup *)unode->aux;
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
 		if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
 		    qg->reserved + qg->rfer + num_bytes >
@@ -1489,7 +1489,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(ulist, glist->group->qgroupid,
-				  (u64)glist->group, GFP_ATOMIC);
+				  (uintptr_t)glist->group, GFP_ATOMIC);
 		}
 	}
 	if (ret)
@@ -1502,7 +1502,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 
-		qg = (struct btrfs_qgroup *)unode->aux;
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
 		qg->reserved += num_bytes;
 	}
@@ -1541,19 +1541,19 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 		goto out;
 
 	ulist = ulist_alloc(GFP_ATOMIC);
-	ulist_add(ulist, qgroup->qgroupid, (u64)qgroup, GFP_ATOMIC);
+	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
 		struct btrfs_qgroup *qg;
 		struct btrfs_qgroup_list *glist;
 
-		qg = (struct btrfs_qgroup *)unode->aux;
+		qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
 
 		qg->reserved -= num_bytes;
 
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ulist_add(ulist, glist->group->qgroupid,
-				  (u64)glist->group, GFP_ATOMIC);
+				  (uintptr_t)glist->group, GFP_ATOMIC);
 		}
 	}
 
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f22fdd41ff49..e5c867996aa0 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1035,7 +1035,7 @@ struct backref_ctx {
 
 static int __clone_root_cmp_bsearch(const void *key, const void *elt)
 {
-	u64 root = (u64)key;
+	u64 root = (u64)(uintptr_t)key;
 	struct clone_root *cr = (struct clone_root *)elt;
 
 	if (root < cr->root->objectid)
@@ -1069,7 +1069,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	u64 i_size;
 
 	/* First check if the root is in the list of accepted clone sources */
-	found = bsearch((void *)root, bctx->sctx->clone_roots,
+	found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
 			bctx->sctx->clone_roots_cnt,
 			sizeof(struct clone_root),
 			__clone_root_cmp_bsearch);

From 7c735313bd1277c2eb28421934d4c7a0fa7339f7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 13 Aug 2012 15:43:26 -0400
Subject: [PATCH 026/121] Btrfs: update last trans if we don't update the inode

There is a completely impossible situation to hit where you can preallocate
a file, fsync it, write into the preallocated region, have the transaction
commit twice and then fsync and then immediately lose power and lose all of
the contents of the write.  This patch fixes this just so I feel better
about the situation and because it is lightweight, we just update the
last_trans when we finish an ordered IO and we don't update the inode
itself.  This way we are completely safe and I feel better.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ec154f954646..6971bac66d9d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1949,6 +1949,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 			btrfs_abort_transaction(trans, root, ret);
 			goto out_unlock;
 		}
+	} else {
+		btrfs_set_inode_last_trans(trans, inode);
 	}
 	ret = 0;
 out_unlock:

From 54338b5cc4fa3cfe260e8e4ade8b62a9079ea3f9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 14 Aug 2012 16:20:52 -0400
Subject: [PATCH 027/121] Btrfs: do not allocate chunks as agressively

Swinging this pendulum back the other way.  We've been allocating chunks up
to 2% of the disk no matter how much we actually have allocated.  So instead
fix this calculation to only allocate chunks if we have more than 80% of the
space available allocated.  Please test this as it will likely cause all
sorts of ENOSPC problems to pop up suddenly.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent-tree.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba58024d40d3..62fc92f2b9d7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3504,7 +3504,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	 * and purposes it's used space.  Don't worry about locking the
 	 * global_rsv, it doesn't change except when the transaction commits.
 	 */
-	num_allocated += global_rsv->size;
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+		num_allocated += global_rsv->size;
 
 	/*
 	 * in limited mode, we want to have some free space up to
@@ -3518,15 +3519,8 @@ static int should_alloc_chunk(struct btrfs_root *root,
 		if (num_bytes - num_allocated < thresh)
 			return 1;
 	}
-	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
-	/* 256MB or 2% of the FS */
-	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
-	/* system chunks need a much small threshold */
-	if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		thresh = 32 * 1024 * 1024;
-
-	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
+	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 	return 1;
 }

From 224ecce517af3a952321202cdf304c12e138caca Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 16 Aug 2012 16:32:06 -0400
Subject: [PATCH 028/121] Btrfs: fix possible corruption when fsyncing written
 prealloced extents

While working on my fsync patch my fsync tester kept hitting mismatching
md5sums when I would randomly write to a prealloc'ed region, syncfs() and
then write to the prealloced region some more and then fsync() and then
immediately reboot.  This is because the tree logging code will skip writing
csums for file extents who's generation is less than the current running
transaction.  When we mark extents as written we haven't been updating their
generation so they were always being skipped.  This wouldn't happen if you
were to preallocate and then write in the same transaction, but if you for
example prealloced a VM you could definitely run into this problem.  This
patch makes my fsync tester happy again.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/file.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5caf285c6e4d..b7c885c8423f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -935,12 +935,16 @@ again:
 			btrfs_set_item_key_safe(trans, root, path, &new_key);
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - end);
 			btrfs_set_file_extent_offset(leaf, fi,
 						     end - orig_offset);
 			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							end - other_start);
 			btrfs_mark_buffer_dirty(leaf);
@@ -958,12 +962,16 @@ again:
 					    struct btrfs_file_extent_item);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			path->slots[0]++;
 			new_key.offset = start;
 			btrfs_set_item_key_safe(trans, root, path, &new_key);
 
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							other_end - start);
 			btrfs_set_file_extent_offset(leaf, fi,
@@ -991,12 +999,14 @@ again:
 		leaf = path->nodes[0];
 		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 				    struct btrfs_file_extent_item);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						split - key.offset);
 
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
 
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						extent_end - split);
@@ -1056,12 +1066,14 @@ again:
 			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_mark_buffer_dirty(leaf);
 	} else {
 		fi = btrfs_item_ptr(leaf, del_slot - 1,
 			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						extent_end - key.offset);
 		btrfs_mark_buffer_dirty(leaf);

From 5dc562c541e1026df9d43913c2f6b91156e22d32 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 17 Aug 2012 13:14:17 -0400
Subject: [PATCH 029/121] Btrfs: turbo charge fsync

At least for the vm workload.  Currently on fsync we will

1) Truncate all items in the log tree for the given inode if they exist

and

2) Copy all items for a given inode into the log

The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing.  This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them.  We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already.  Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write

		Original	Patched
SATA drive	82KB/s		140KB/s
Fusion drive	431KB/s		2532KB/s

So around 2-6 times faster depending on your hardware.  There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok.  This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get.  All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.

The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok.  I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/btrfs_inode.h |   1 +
 fs/btrfs/ctree.h       |  12 ++-
 fs/btrfs/extent_map.c  |  34 ++++++-
 fs/btrfs/extent_map.h  |   5 +-
 fs/btrfs/file.c        |  62 ++++++++----
 fs/btrfs/inode.c       | 120 ++++++++++++++++++++--
 fs/btrfs/ioctl.c       |   4 +-
 fs/btrfs/tree-log.c    | 220 +++++++++++++++++++++++++++++++++++++++--
 8 files changed, 416 insertions(+), 42 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5b2ad6bc4fe7..7c7bf818f3c1 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -38,6 +38,7 @@
 #define BTRFS_INODE_DELALLOC_META_RESERVED	4
 #define BTRFS_INODE_HAS_ORPHAN_ITEM		5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
+#define BTRFS_INODE_NEEDS_FULL_SYNC		7
 
 /* in memory btrfs inode */
 struct btrfs_inode {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0d195b507660..4b81ea3fa1b2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3315,9 +3315,17 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
+int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
+			       u64 start, u64 end, int skip_pinned,
+			       int modified);
 extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct inode *inode,
+			 struct btrfs_path *path, u64 start, u64 end,
+			 u64 *hint_byte, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode, u64 start,
+		       u64 end, u64 *hint_byte, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7c97b3301459..1fe82cfc1d93 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -35,6 +35,7 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
 	tree->map = RB_ROOT;
+	INIT_LIST_HEAD(&tree->modified_extents);
 	rwlock_init(&tree->lock);
 }
 
@@ -54,7 +55,9 @@ struct extent_map *alloc_extent_map(void)
 	em->in_tree = 0;
 	em->flags = 0;
 	em->compress_type = BTRFS_COMPRESS_NONE;
+	em->generation = 0;
 	atomic_set(&em->refs, 1);
+	INIT_LIST_HEAD(&em->list);
 	return em;
 }
 
@@ -72,6 +75,7 @@ void free_extent_map(struct extent_map *em)
 	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
 		WARN_ON(em->in_tree);
+		WARN_ON(!list_empty(&em->list));
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
@@ -198,6 +202,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
+			if (merge->generation > em->generation) {
+				em->generation = merge->generation;
+				list_move(&em->list, &tree->modified_extents);
+			}
+
+			list_del_init(&merge->list);
 			rb_erase(&merge->rb_node, &tree->map);
 			free_extent_map(merge);
 		}
@@ -211,11 +221,29 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
+		if (merge->generation > em->generation) {
+			em->generation = merge->generation;
+			list_move(&em->list, &tree->modified_extents);
+		}
+		list_del_init(&merge->list);
 		free_extent_map(merge);
 	}
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpint_extent_cache - unpin an extent from the cache
+ * @tree:	tree to unpin the extent in
+ * @start:	logical offset in the file
+ * @len:	length of the extent
+ * @gen:	generation that this extent has been modified in
+ * @prealloc:	if this is set we need to clear the prealloc flag
+ *
+ * Called after an extent has been written to disk properly.  Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+		       u64 gen)
 {
 	int ret = 0;
 	struct extent_map *em;
@@ -228,10 +256,11 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
 	if (!em)
 		goto out;
 
+	list_move(&em->list, &tree->modified_extents);
+	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 	try_merge_map(tree, em);
-
 	free_extent_map(em);
 out:
 	write_unlock(&tree->lock);
@@ -358,6 +387,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
 	rb_erase(&em->rb_node, &tree->map);
+	list_del_init(&em->list);
 	em->in_tree = 0;
 	return ret;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 1195f09761fe..2388a60bd6e3 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -23,15 +23,18 @@ struct extent_map {
 	u64 orig_start;
 	u64 block_start;
 	u64 block_len;
+	u64 generation;
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
 	unsigned int in_tree;
 	unsigned int compress_type;
+	struct list_head list;
 };
 
 struct extent_map_tree {
 	struct rb_root map;
+	struct list_head modified_extents;
 	rwlock_t lock;
 };
 
@@ -60,7 +63,7 @@ struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b7c885c8423f..399f9d71a926 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -459,13 +459,14 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
  * [start, end].  Existing extents are split as required.
  */
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-			    int skip_pinned)
+			       int skip_pinned)
 {
 	struct extent_map *em;
 	struct extent_map *split = NULL;
 	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 len = end - start + 1;
+	u64 gen;
 	int ret;
 	int testend = 1;
 	unsigned long flags;
@@ -490,6 +491,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			break;
 		}
 		flags = em->flags;
+		gen = em->generation;
 		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
 			if (testend && em->start + em->len >= start + len) {
 				free_extent_map(em);
@@ -518,12 +520,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 				split->block_len = em->block_len;
 			else
 				split->block_len = split->len;
-
+			split->generation = gen;
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret); /* Logic error */
+			list_move(&split->list, &em_tree->modified_extents);
 			free_extent_map(split);
 			split = split2;
 			split2 = NULL;
@@ -537,6 +540,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
+			split->generation = gen;
 
 			if (compressed) {
 				split->block_len = em->block_len;
@@ -550,6 +554,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret); /* Logic error */
+			list_move(&split->list, &em_tree->modified_extents);
 			free_extent_map(split);
 			split = NULL;
 		}
@@ -576,13 +581,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
  */
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct inode *inode,
+			 struct btrfs_path *path, u64 start, u64 end,
+			 u64 *hint_byte, int drop_cache)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
-	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key new_key;
 	u64 ino = btrfs_ino(inode);
@@ -597,14 +602,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 	int recow;
 	int ret;
 	int modify_tree = -1;
+	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
 	if (start >= BTRFS_I(inode)->disk_i_size)
 		modify_tree = 0;
 
@@ -707,7 +709,7 @@ next_slot:
 							extent_end - start);
 			btrfs_mark_buffer_dirty(leaf);
 
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
@@ -734,7 +736,7 @@ next_slot:
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - end);
 			btrfs_mark_buffer_dirty(leaf);
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0) {
 				inode_sub_bytes(inode, end - key.offset);
 				*hint_byte = disk_bytenr;
 			}
@@ -753,7 +755,7 @@ next_slot:
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
 			btrfs_mark_buffer_dirty(leaf);
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0) {
 				inode_sub_bytes(inode, extent_end - start);
 				*hint_byte = disk_bytenr;
 			}
@@ -777,12 +779,13 @@ next_slot:
 				del_nr++;
 			}
 
-			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			if (update_refs &&
+			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
 				extent_end = ALIGN(extent_end,
 						   root->sectorsize);
-			} else if (disk_bytenr > 0) {
+			} else if (update_refs && disk_bytenr > 0) {
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
@@ -806,7 +809,7 @@ next_slot:
 					      del_nr);
 			if (ret) {
 				btrfs_abort_transaction(trans, root, ret);
-				goto out;
+				break;
 			}
 
 			del_nr = 0;
@@ -825,7 +828,22 @@ next_slot:
 			btrfs_abort_transaction(trans, root, ret);
 	}
 
-out:
+	btrfs_release_path(path);
+	return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode, u64 start,
+		       u64 end, u64 *hint_byte, int drop_cache)
+{
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
+				   hint_byte, drop_cache);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -892,8 +910,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	int ret;
 	u64 ino = btrfs_ino(inode);
 
-	btrfs_drop_extent_cache(inode, start, end - 1, 0);
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1556,6 +1572,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	    BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
 		BTRFS_I(inode)->last_trans = 0;
+
+		/*
+		 * We'v had everything committed since the last time we were
+		 * modified so clear this flag in case it was set for whatever
+		 * reason, it's no longer relevant.
+		 */
+		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			  &BTRFS_I(inode)->runtime_flags);
 		mutex_unlock(&inode->i_mutex);
 		goto out;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6971bac66d9d..1b99fe8a129d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -247,7 +247,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
-	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
+	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end,
 				 &hint_byte, 1);
 	if (ret)
 		return ret;
@@ -1803,7 +1803,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
+	ret = btrfs_drop_extents(trans, root, inode, file_pos,
+				 file_pos + num_bytes,
 				 &hint, 0);
 	if (ret)
 		goto out;
@@ -1929,11 +1930,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						ordered_extent->len,
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
-		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-				   ordered_extent->file_offset,
-				   ordered_extent->len);
 	}
-
+	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+			   ordered_extent->file_offset, ordered_extent->len,
+			   trans->transid);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_unlock;
@@ -2592,6 +2592,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+	/*
+	 * If we were modified in the current generation and evicted from memory
+	 * and then re-read we need to do a full sync since we don't have any
+	 * idea about which extents were modified before we were evicted from
+	 * cache.
+	 */
+	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
 	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
@@ -3269,8 +3281,13 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	path->reada = -1;
 
+	/*
+	 * We want to drop from the next block forward in case this new size is
+	 * not block aligned since we will be keeping the last block of the
+	 * extent just the way it is.
+	 */
 	if (root->ref_cows || root == root->fs_info->tree_root)
-		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+		btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0);
 
 	/*
 	 * This function is also used to drop the items in the log tree before
@@ -3579,6 +3596,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (oldsize + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3615,6 +3633,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+			struct extent_map *hole_em;
 			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
@@ -3624,7 +3643,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			err = btrfs_drop_extents(trans, inode, cur_offset,
+			err = btrfs_drop_extents(trans, root, inode,
+						 cur_offset,
 						 cur_offset + hole_size,
 						 &hint_byte, 1);
 			if (err) {
@@ -3643,9 +3663,39 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				break;
 			}
 
-			btrfs_drop_extent_cache(inode, hole_start,
-					last_byte - 1, 0);
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + hole_size - 1, 0);
+			hole_em = alloc_extent_map();
+			if (!hole_em) {
+				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+					&BTRFS_I(inode)->runtime_flags);
+				goto next;
+			}
+			hole_em->start = cur_offset;
+			hole_em->len = hole_size;
+			hole_em->orig_start = cur_offset;
 
+			hole_em->block_start = EXTENT_MAP_HOLE;
+			hole_em->block_len = 0;
+			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+			hole_em->compress_type = BTRFS_COMPRESS_NONE;
+			hole_em->generation = trans->transid;
+
+			while (1) {
+				write_lock(&em_tree->lock);
+				err = add_extent_mapping(em_tree, hole_em);
+				if (!err)
+					list_move(&hole_em->list,
+						  &em_tree->modified_extents);
+				write_unlock(&em_tree->lock);
+				if (err != -EEXIST)
+					break;
+				btrfs_drop_extent_cache(inode, cur_offset,
+							cur_offset +
+							hole_size - 1, 0);
+			}
+			free_extent_map(hole_em);
+next:
 			btrfs_update_inode(trans, root, inode);
 			btrfs_end_transaction(trans, root);
 		}
@@ -4673,6 +4723,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BTRFS_I(inode)->generation = trans->transid;
 	inode->i_generation = BTRFS_I(inode)->generation;
 
+	/*
+	 * We could have gotten an inode number from somebody who was fsynced
+	 * and then removed in this same transaction, so let's just set full
+	 * sync since it will be a full sync anyway and this will blow away the
+	 * old info in the log.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
 	if (S_ISDIR(mode))
 		owner = 0;
 	else
@@ -6839,6 +6897,15 @@ static int btrfs_truncate(struct inode *inode)
 					   &BTRFS_I(inode)->runtime_flags))
 		btrfs_add_ordered_operation(trans, root, inode);
 
+	/*
+	 * So if we truncate and then write and fsync we normally would just
+	 * write the extents that changed, which is a problem if we need to
+	 * first truncate that entire inode.  So set this flag so we write out
+	 * all of the extents in the inode to the sync log so we're completely
+	 * safe.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+
 	while (1) {
 		ret = btrfs_block_rsv_refill(root, rsv, min_size);
 		if (ret) {
@@ -7510,6 +7577,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 				       loff_t actual_len, u64 *alloc_hint,
 				       struct btrfs_trans_handle *trans)
 {
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
@@ -7550,6 +7619,37 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		btrfs_drop_extent_cache(inode, cur_offset,
 					cur_offset + ins.offset -1, 0);
 
+		em = alloc_extent_map();
+		if (!em) {
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+			goto next;
+		}
+
+		em->start = cur_offset;
+		em->orig_start = cur_offset;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		em->generation = trans->transid;
+
+		while (1) {
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			if (!ret)
+				list_move(&em->list,
+					  &em_tree->modified_extents);
+			write_unlock(&em_tree->lock);
+			if (ret != -EEXIST)
+				break;
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + ins.offset - 1,
+						0);
+		}
+		free_extent_map(em);
+next:
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		*alloc_hint = ins.objectid + ins.offset;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9df50fa8a078..95223222d5ad 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2576,7 +2576,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					datal -= off - key.offset;
 				}
 
-				ret = btrfs_drop_extents(trans, inode,
+				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
 							 &hint_byte, 1);
@@ -2650,7 +2650,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				size -= skip + trim;
 				datal -= skip + trim;
 
-				ret = btrfs_drop_extents(trans, inode,
+				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
 							 &hint_byte, 1);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c86670f4f285..f2ff02c55130 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/list_sort.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
@@ -550,7 +551,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
-	ret = btrfs_drop_extents(trans, inode, start, extent_end,
+	ret = btrfs_drop_extents(trans, root, inode, start, extent_end,
 				 &alloc_hint, 1);
 	BUG_ON(ret);
 
@@ -2803,6 +2804,194 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct extent_map *em1, *em2;
+
+	em1 = list_entry(a, struct extent_map, list);
+	em2 = list_entry(b, struct extent_map, list);
+
+	if (em1->start < em2->start)
+		return -1;
+	else if (em1->start > em2->start)
+		return 1;
+	return 0;
+}
+
+struct log_args {
+	struct extent_buffer *src;
+	u64 next_offset;
+	int start_slot;
+	int nr;
+};
+
+static int log_one_extent(struct btrfs_trans_handle *trans,
+			  struct inode *inode, struct btrfs_root *root,
+			  struct extent_map *em, struct btrfs_path *path,
+			  struct btrfs_path *dst_path, struct log_args *args)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 start = em->start;
+	u64 len = em->len;
+	u64 num_bytes;
+	int nritems;
+	int ret;
+
+	if (BTRFS_I(inode)->logged_trans == trans->transid) {
+		u64 tmp;
+		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
+					   start + len, &tmp, 0);
+		if (ret)
+			return ret;
+	}
+
+	while (len) {
+		if (args->nr)
+			goto next_slot;
+		key.objectid = btrfs_ino(inode);
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = start;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			return ret;
+		if (ret) {
+			/*
+			 * This shouldn't happen, but it might so warn and
+			 * return an error.
+			 */
+			WARN_ON(1);
+			return -ENOENT;
+		}
+		args->src = path->nodes[0];
+next_slot:
+		fi = btrfs_item_ptr(args->src, path->slots[0],
+				    struct btrfs_file_extent_item);
+		if (args->nr &&
+		    args->start_slot + args->nr == path->slots[0]) {
+			args->nr++;
+		} else if (args->nr) {
+			ret = copy_items(trans, log, dst_path, args->src,
+					 args->start_slot, args->nr,
+					 LOG_INODE_ALL);
+			if (ret)
+				return ret;
+			args->nr = 1;
+			args->start_slot = path->slots[0];
+		} else if (!args->nr) {
+			args->nr = 1;
+			args->start_slot = path->slots[0];
+		}
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		path->slots[0]++;
+		num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
+		if (len < num_bytes) {
+			/* I _think_ this is ok, envision we write to a
+			 * preallocated space that is adjacent to a previously
+			 * written preallocated space that gets merged when we
+			 * mark this preallocated space written.  If we do not
+			 * have the adjacent extent in cache then when we copy
+			 * this extent it could end up being larger than our EM
+			 * thinks it is, which is a-ok, so just set len to 0.
+			 */
+			len = 0;
+		} else {
+			len -= num_bytes;
+		}
+		start += btrfs_file_extent_num_bytes(args->src, fi);
+		args->next_offset = start;
+
+		if (path->slots[0] < nritems) {
+			if (len)
+				goto next_slot;
+			break;
+		}
+
+		if (args->nr) {
+			ret = copy_items(trans, log, dst_path, args->src,
+					 args->start_slot, args->nr,
+					 LOG_INODE_ALL);
+			if (ret)
+				return ret;
+			args->nr = 0;
+			btrfs_release_path(path);
+		}
+	}
+
+	return 0;
+}
+
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct inode *inode,
+				     struct btrfs_path *path,
+				     struct btrfs_path *dst_path)
+{
+	struct log_args args;
+	struct btrfs_root *log = root->log_root;
+	struct extent_map *em, *n;
+	struct list_head extents;
+	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+	u64 test_gen;
+	int ret = 0;
+
+	INIT_LIST_HEAD(&extents);
+
+	memset(&args, 0, sizeof(args));
+
+	write_lock(&tree->lock);
+	test_gen = root->fs_info->last_trans_committed;
+
+	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+		list_del_init(&em->list);
+		if (em->generation <= test_gen)
+			continue;
+		list_add_tail(&em->list, &extents);
+	}
+
+	list_sort(NULL, &extents, extent_cmp);
+
+	while (!list_empty(&extents)) {
+		em = list_entry(extents.next, struct extent_map, list);
+
+		list_del_init(&em->list);
+
+		/*
+		 * If we had an error we just need to delete everybody from our
+		 * private list.
+		 */
+		if (ret)
+			continue;
+
+		/*
+		 * If the previous EM and the last extent we left off on aren't
+		 * sequential then we need to copy the items we have and redo
+		 * our search
+		 */
+		if (args.nr && em->start != args.next_offset) {
+			ret = copy_items(trans, log, dst_path, args.src,
+					 args.start_slot, args.nr,
+					 LOG_INODE_ALL);
+			if (ret)
+				continue;
+			btrfs_release_path(path);
+			args.nr = 0;
+		}
+
+		ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
+	}
+
+	if (!ret && args.nr)
+		ret = copy_items(trans, log, dst_path, args.src,
+				 args.start_slot, args.nr, LOG_INODE_ALL);
+	btrfs_release_path(path);
+	WARN_ON(!list_empty(&extents));
+	write_unlock(&tree->lock);
+	return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -2832,6 +3021,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	int nritems;
 	int ins_start_slot = 0;
 	int ins_nr;
+	bool fast_search = false;
 	u64 ino = btrfs_ino(inode);
 
 	log = root->log_root;
@@ -2851,10 +3041,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	max_key.objectid = ino;
 
-	/* today the code can only do partial logging of directories */
-	if (!S_ISDIR(inode->i_mode))
-	    inode_only = LOG_INODE_ALL;
 
+	/* today the code can only do partial logging of directories */
 	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
 	else
@@ -2881,7 +3069,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			max_key_type = BTRFS_XATTR_ITEM_KEY;
 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
 	} else {
-		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				       &BTRFS_I(inode)->runtime_flags)) {
+			ret = btrfs_truncate_inode_items(trans, log,
+							 inode, 0, 0);
+		} else {
+			fast_search = true;
+			max_key.type = BTRFS_XATTR_ITEM_KEY;
+			ret = drop_objectid_items(trans, log, path, ino,
+						  BTRFS_XATTR_ITEM_KEY);
+		}
 	}
 	if (ret) {
 		err = ret;
@@ -2960,7 +3157,18 @@ next_slot:
 		}
 		ins_nr = 0;
 	}
-	WARN_ON(ins_nr);
+
+	if (fast_search) {
+		btrfs_release_path(path);
+		btrfs_release_path(dst_path);
+		ret = btrfs_log_changed_extents(trans, root, inode, path,
+						dst_path);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	}
+
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
 		btrfs_release_path(path);
 		btrfs_release_path(dst_path);

From 0fa83cdb1d72a94ea84ab6380747de6ac7cc8753 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 24 Aug 2012 14:48:11 -0400
Subject: [PATCH 030/121] Btrfs: only warn if we hit an error when doing the
 tree logging

I hit this a couple times while working on my fsync patch (all my bugs, not
normal operation), but with my new stuff we could have new errors from cases
I have not encountered, so instead of BUG()'ing we should be WARN()'ing so
that we are notified there is a problem but the user doesn't lose their
data.  We can easily commit the transaction in the case that the tree
logging fails and still be fine, so let's try and be as nice to the user as
possible.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f2ff02c55130..94db438494df 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3346,7 +3346,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 end_trans:
 	dput(old_parent);
 	if (ret < 0) {
-		BUG_ON(ret != -ENOSPC);
+		WARN_ON(ret != -ENOSPC);
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 1;
 	}

From 06d3d22b456c2f87aeb1eb4517eeabb47e21fcc9 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 27 Aug 2012 10:52:19 -0600
Subject: [PATCH 031/121] Btrfs: cleanup extents after we finish logging inode

This is based on Josef's "Btrfs: turbo charge fsync".

We should cleanup those extents after we've finished logging inode,
otherwise we may do redundant work on them.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/tree-log.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 94db438494df..58075d711d24 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3167,6 +3167,12 @@ next_slot:
 			err = ret;
 			goto out_unlock;
 		}
+	} else {
+		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+		struct extent_map *em, *n;
+
+		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+			list_del_init(&em->list);
 	}
 
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {

From ca7e70f59078046db28501519308c2061b0e7a6f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 27 Aug 2012 17:48:15 -0400
Subject: [PATCH 032/121] Btrfs: do not needlessly restart the transaction for
 enospc

We will stop and restart a transaction every time we move to a different leaf
when truncating a file.  This is for enospc reasons, but really we could
probably get away with doing this a little better by actually working until we
hit an ENOSPC.  So add a ->failfast flag to the block_rsv and set it when we do
truncates which will fail as soon as the block rsv runs out of space, and then
at that point we can stop and restart the transaction and refill the block rsv
and carry on.  This will make rm'ing of a file with lots of extents a bit
faster.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/inode.c       | 53 ++++++++++++++----------------------------
 3 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4b81ea3fa1b2..81c772b5dc8e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1034,6 +1034,7 @@ struct btrfs_block_rsv {
 	struct btrfs_space_info *space_info;
 	spinlock_t lock;
 	unsigned int full;
+	unsigned int failfast;
 };
 
 /*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62fc92f2b9d7..df2f17b0ac5d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6308,7 +6308,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	ret = block_rsv_use_bytes(block_rsv, blocksize);
 	if (!ret)
 		return block_rsv;
-	if (ret) {
+	if (ret && !block_rsv->failfast) {
 		static DEFINE_RATELIMIT_STATE(_rs,
 				DEFAULT_RATELIMIT_INTERVAL,
 				/*DEFAULT_RATELIMIT_BURST*/ 2);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1b99fe8a129d..ca4fa05171ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3448,12 +3448,6 @@ delete:
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot) {
-			if (root->ref_cows &&
-			    BTRFS_I(inode)->location.objectid !=
-						BTRFS_FREE_INO_OBJECTID) {
-				err = -EAGAIN;
-				goto out;
-			}
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
@@ -3826,6 +3820,7 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 	rsv->size = min_size;
+	rsv->failfast = 1;
 	global_rsv = &root->fs_info->global_block_rsv;
 
 	btrfs_i_size_write(inode, 0);
@@ -3870,7 +3865,7 @@ void btrfs_evict_inode(struct inode *inode)
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-		if (ret != -EAGAIN)
+		if (ret != -ENOSPC)
 			break;
 
 		nr = trans->blocks_used;
@@ -6852,6 +6847,7 @@ static int btrfs_truncate(struct inode *inode)
 	if (!rsv)
 		return -ENOMEM;
 	rsv->size = min_size;
+	rsv->failfast = 1;
 
 	/*
 	 * 1 for the truncate slack space
@@ -6905,37 +6901,13 @@ static int btrfs_truncate(struct inode *inode)
 	 * safe.
 	 */
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+	trans->block_rsv = rsv;
 
 	while (1) {
-		ret = btrfs_block_rsv_refill(root, rsv, min_size);
-		if (ret) {
-			/*
-			 * This can only happen with the original transaction we
-			 * started above, every other time we shouldn't have a
-			 * transaction started yet.
-			 */
-			if (ret == -EAGAIN)
-				goto end_trans;
-			err = ret;
-			break;
-		}
-
-		if (!trans) {
-			/* Just need the 1 for updating the inode */
-			trans = btrfs_start_transaction(root, 1);
-			if (IS_ERR(trans)) {
-				ret = err = PTR_ERR(trans);
-				trans = NULL;
-				break;
-			}
-		}
-
-		trans->block_rsv = rsv;
-
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
-		if (ret != -EAGAIN) {
+		if (ret != -ENOSPC) {
 			err = ret;
 			break;
 		}
@@ -6946,11 +6918,22 @@ static int btrfs_truncate(struct inode *inode)
 			err = ret;
 			break;
 		}
-end_trans:
+
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
+
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans)) {
+			ret = err = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+					      rsv, min_size);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {

From 4e2f84e63dc138eca91e89ccbc34f37732ce58f7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Mon, 27 Aug 2012 10:52:20 -0600
Subject: [PATCH 033/121] Btrfs: improve fsync by filtering extents that we
 want

This is based on Josef's "Btrfs: turbo charge fsync".

The above Josef's patch performs very good in random sync write test,
because we won't have too much extents to merge.

However, it does not performs good on the test:
dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync

The reason is when we do sequencial sync write, we need to merge the
current extent just with the previous one, so that we can get accumulated
extents to log:

A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ...

So we'll have to flush more and more checksum into log tree, which is the
bottleneck according to my tests.

But we can avoid this by telling fsync the real extents that are needed
to be logged.

With this, I did the above dd sync write test (size=50m),

         w/o (orig)   w/ (josef's)   w/ (this)
SATA      104KB/s       109KB/s       121KB/s
ramdisk   1.5MB/s       1.5MB/s       10.7MB/s (613%)

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/extent_map.c | 20 ++++++++++++++++++++
 fs/btrfs/extent_map.h |  2 ++
 fs/btrfs/inode.c      |  1 +
 fs/btrfs/tree-log.c   |  6 +++---
 4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1fe82cfc1d93..ac606f076eb7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -203,6 +203,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
 			if (merge->generation > em->generation) {
+				em->mod_start = em->start;
+				em->mod_len = em->len;
 				em->generation = merge->generation;
 				list_move(&em->list, &tree->modified_extents);
 			}
@@ -222,6 +224,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
 		if (merge->generation > em->generation) {
+			em->mod_len = em->len;
 			em->generation = merge->generation;
 			list_move(&em->list, &tree->modified_extents);
 		}
@@ -247,6 +250,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 {
 	int ret = 0;
 	struct extent_map *em;
+	bool prealloc = false;
 
 	write_lock(&tree->lock);
 	em = lookup_extent_mapping(tree, start, len);
@@ -259,8 +263,21 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 	list_move(&em->list, &tree->modified_extents);
 	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		prealloc = true;
+		clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+	}
 
 	try_merge_map(tree, em);
+
+	if (prealloc) {
+		em->mod_start = em->start;
+		em->mod_len = em->len;
+	}
+
 	free_extent_map(em);
 out:
 	write_unlock(&tree->lock);
@@ -298,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	}
 	atomic_inc(&em->refs);
 
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
 	try_merge_map(tree, em);
 out:
 	return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 2388a60bd6e3..8e6294b51357 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -20,6 +20,8 @@ struct extent_map {
 	/* all of these are in bytes */
 	u64 start;
 	u64 len;
+	u64 mod_start;
+	u64 mod_len;
 	u64 orig_start;
 	u64 block_start;
 	u64 block_len;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ca4fa05171ab..878116d9625d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1308,6 +1308,7 @@ out_check:
 			em->block_start = disk_bytenr;
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
+			set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 			while (1) {
 				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 58075d711d24..71e71539ffb7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2833,8 +2833,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
-	u64 start = em->start;
-	u64 len = em->len;
+	u64 start = em->mod_start;
+	u64 len = em->mod_len;
 	u64 num_bytes;
 	int nritems;
 	int ret;
@@ -2970,7 +2970,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		 * sequential then we need to copy the items we have and redo
 		 * our search
 		 */
-		if (args.nr && em->start != args.next_offset) {
+		if (args.nr && em->mod_start != args.next_offset) {
 			ret = copy_items(trans, log, dst_path, args.src,
 					 args.start_slot, args.nr,
 					 LOG_INODE_ALL);

From 321f0e70225abc792d74902a2bc4a60164265fd4 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 28 Aug 2012 22:13:02 -0600
Subject: [PATCH 034/121] Btrfs: fix wrong orphan count of the fs/file tree

If we add a new orphan item, we should increase the atomic counter,
not decrease it. Fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 878116d9625d..a6824bd04493 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2228,7 +2228,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 			insert = 1;
 #endif
 		insert = 1;
-		atomic_dec(&root->orphan_inodes);
+		atomic_inc(&root->orphan_inodes);
 	}
 
 	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,

From 46d8bc34248f3a94dea910137d1ddf5fb1e3a1cc Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 29 Aug 2012 01:07:55 -0600
Subject: [PATCH 035/121] Btrfs: fix a bug in checking whether a inode is
 already in log

This is based on Josef's "Btrfs: turbo charge fsync".

The current btrfs checks if an inode is in log by comparing
root's last_log_commit to inode's last_sub_trans[2].

But the problem is that this root->last_log_commit is shared among
inodes.

Say we have N inodes to be logged, after the first inode,
root's last_log_commit is updated and the N-1 remained files will
be skipped.

This fixes the bug by keeping a local copy of root's last_log_commit
inside each inode and this local copy will be maintained itself.

[1]: we regard each log transaction as a subset of btrfs's transaction,
i.e. sub_trans

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/btrfs_inode.h | 14 ++++++--------
 fs/btrfs/inode.c       |  2 ++
 fs/btrfs/transaction.h |  1 +
 fs/btrfs/tree-log.c    |  1 +
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7c7bf818f3c1..ed8ca7ca5eff 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -144,6 +144,9 @@ struct btrfs_inode {
 	/* flags field from the on disk inode */
 	u32 flags;
 
+	/* a local copy of root's last_log_commit */
+	unsigned long last_log_commit;
+
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such.  outstanding_extents is the number of extent
@@ -203,15 +206,10 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 
 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
-
-	mutex_lock(&root->log_mutex);
 	if (BTRFS_I(inode)->logged_trans == generation &&
-	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-		ret = 1;
-	mutex_unlock(&root->log_mutex);
-	return ret;
+	    BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit)
+		return 1;
+	return 0;
 }
 
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a6824bd04493..24745b8f2745 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6774,6 +6774,7 @@ again:
 
 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
@@ -7018,6 +7019,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->csum_bytes = 0;
 	ei->index_cnt = (u64)-1;
 	ei->last_unlink_trans = 0;
+	ei->last_log_commit = 0;
 
 	spin_lock_init(&ei->lock);
 	ei->outstanding_extents = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e8b8416c688b..1a138bfb8f13 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -88,6 +88,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 {
 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 71e71539ffb7..fc0df95c2862 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3185,6 +3185,7 @@ next_slot:
 		}
 	}
 	BTRFS_I(inode)->logged_trans = trans->transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 

From d2794405113cd04f13a58e32e4d541dc87ec86e4 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 29 Aug 2012 01:07:56 -0600
Subject: [PATCH 036/121] Btrfs: check if an inode has no checksum when logging
 it

This is based on Josef's "Btrfs: turbo charge fsync".

If an inode is a BTRFS_INODE_NODATASUM one, we don't need to look for csum
items any more.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/tree-log.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fc0df95c2862..5e2ffb95cc64 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2680,13 +2680,14 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 }
 
 static noinline int copy_items(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *log,
+			       struct inode *inode,
 			       struct btrfs_path *dst_path,
 			       struct extent_buffer *src,
 			       int start_slot, int nr, int inode_only)
 {
 	unsigned long src_offset;
 	unsigned long dst_offset;
+	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
 	struct btrfs_file_extent_item *extent;
 	struct btrfs_inode_item *inode_item;
 	int ret;
@@ -2695,6 +2696,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	char *ins_data;
 	int i;
 	struct list_head ordered_sums;
+	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	INIT_LIST_HEAD(&ordered_sums);
 
@@ -2745,7 +2747,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 		 * or deletes of this inode don't have to relog the inode
 		 * again
 		 */
-		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+		    !skip_csum) {
 			int found_type;
 			extent = btrfs_item_ptr(src, start_slot + i,
 						struct btrfs_file_extent_item);
@@ -2873,7 +2876,7 @@ next_slot:
 		    args->start_slot + args->nr == path->slots[0]) {
 			args->nr++;
 		} else if (args->nr) {
-			ret = copy_items(trans, log, dst_path, args->src,
+			ret = copy_items(trans, inode, dst_path, args->src,
 					 args->start_slot, args->nr,
 					 LOG_INODE_ALL);
 			if (ret)
@@ -2910,7 +2913,7 @@ next_slot:
 		}
 
 		if (args->nr) {
-			ret = copy_items(trans, log, dst_path, args->src,
+			ret = copy_items(trans, inode, dst_path, args->src,
 					 args->start_slot, args->nr,
 					 LOG_INODE_ALL);
 			if (ret)
@@ -2930,7 +2933,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct btrfs_path *dst_path)
 {
 	struct log_args args;
-	struct btrfs_root *log = root->log_root;
 	struct extent_map *em, *n;
 	struct list_head extents;
 	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
@@ -2971,7 +2973,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		 * our search
 		 */
 		if (args.nr && em->mod_start != args.next_offset) {
-			ret = copy_items(trans, log, dst_path, args.src,
+			ret = copy_items(trans, inode, dst_path, args.src,
 					 args.start_slot, args.nr,
 					 LOG_INODE_ALL);
 			if (ret)
@@ -2984,7 +2986,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 
 	if (!ret && args.nr)
-		ret = copy_items(trans, log, dst_path, args.src,
+		ret = copy_items(trans, inode, dst_path, args.src,
 				 args.start_slot, args.nr, LOG_INODE_ALL);
 	btrfs_release_path(path);
 	WARN_ON(!list_empty(&extents));
@@ -3109,7 +3111,7 @@ again:
 			goto next_slot;
 		}
 
-		ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
 				 ins_nr, inode_only);
 		if (ret) {
 			err = ret;
@@ -3127,7 +3129,7 @@ next_slot:
 			goto again;
 		}
 		if (ins_nr) {
-			ret = copy_items(trans, log, dst_path, src,
+			ret = copy_items(trans, inode, dst_path, src,
 					 ins_start_slot,
 					 ins_nr, inode_only);
 			if (ret) {
@@ -3148,8 +3150,7 @@ next_slot:
 			break;
 	}
 	if (ins_nr) {
-		ret = copy_items(trans, log, dst_path, src,
-				 ins_start_slot,
+		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
 				 ins_nr, inode_only);
 		if (ret) {
 			err = ret;

From 2671485d395c07fca104c972785898d7c52fc942 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 29 Aug 2012 12:24:27 -0400
Subject: [PATCH 037/121] Btrfs: remove unused hint byte argument for
 btrfs_drop_extents

I audited all users of btrfs_drop_extents and found that nobody actually uses
the hint_byte argument.  I'm sure it was used for something at some point but
it's not used now, and the way the pinning works the disk bytenr would never be
immediately useful anyway so lets just remove it.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ctree.h    |  4 ++--
 fs/btrfs/file.c     | 16 +++++-----------
 fs/btrfs/inode.c    | 12 +++---------
 fs/btrfs/ioctl.c    |  5 ++---
 fs/btrfs/tree-log.c |  7 ++-----
 5 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 81c772b5dc8e..71d6ff13d76e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3323,10 +3323,10 @@ extern const struct file_operations btrfs_file_operations;
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 u64 *hint_byte, int drop_cache);
+			 int drop_cache);
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode, u64 start,
-		       u64 end, u64 *hint_byte, int drop_cache);
+		       u64 end, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 399f9d71a926..58598c249951 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -584,7 +584,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 u64 *hint_byte, int drop_cache)
+			 int drop_cache)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -716,7 +716,6 @@ next_slot:
 						new_key.objectid,
 						start - extent_offset, 0);
 				BUG_ON(ret); /* -ENOMEM */
-				*hint_byte = disk_bytenr;
 			}
 			key.offset = start;
 		}
@@ -736,10 +735,8 @@ next_slot:
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - end);
 			btrfs_mark_buffer_dirty(leaf);
-			if (update_refs && disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0)
 				inode_sub_bytes(inode, end - key.offset);
-				*hint_byte = disk_bytenr;
-			}
 			break;
 		}
 
@@ -755,10 +752,8 @@ next_slot:
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
 			btrfs_mark_buffer_dirty(leaf);
-			if (update_refs && disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0)
 				inode_sub_bytes(inode, extent_end - start);
-				*hint_byte = disk_bytenr;
-			}
 			if (end == extent_end)
 				break;
 
@@ -794,7 +789,6 @@ next_slot:
 				BUG_ON(ret); /* -ENOMEM */
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
-				*hint_byte = disk_bytenr;
 			}
 
 			if (end == extent_end)
@@ -834,7 +828,7 @@ next_slot:
 
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode, u64 start,
-		       u64 end, u64 *hint_byte, int drop_cache)
+		       u64 end, int drop_cache)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -843,7 +837,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 	ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
-				   hint_byte, drop_cache);
+				   drop_cache);
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 24745b8f2745..f0a4792492ed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -230,7 +230,6 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	u64 inline_len = actual_end - start;
 	u64 aligned_end = (end + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
-	u64 hint_byte;
 	u64 data_len = inline_len;
 	int ret;
 
@@ -247,8 +246,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
-	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end,
-				 &hint_byte, 1);
+	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
 	if (ret)
 		return ret;
 
@@ -1786,7 +1784,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key ins;
-	u64 hint;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -1805,8 +1802,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * with the others.
 	 */
 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
-				 file_pos + num_bytes,
-				 &hint, 0);
+				 file_pos + num_bytes, 0);
 	if (ret)
 		goto out;
 
@@ -3629,7 +3625,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 		last_byte = (last_byte + mask) & ~mask;
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			struct extent_map *hole_em;
-			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
 
 			trans = btrfs_start_transaction(root, 3);
@@ -3640,8 +3635,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 			err = btrfs_drop_extents(trans, root, inode,
 						 cur_offset,
-						 cur_offset + hole_size,
-						 &hint_byte, 1);
+						 cur_offset + hole_size, 1);
 			if (err) {
 				btrfs_abort_transaction(trans, root, err);
 				btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 95223222d5ad..5543fd562b55 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2353,7 +2353,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	int ret;
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
-	u64 hint_byte;
 
 	/*
 	 * TODO:
@@ -2579,7 +2578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
-							 &hint_byte, 1);
+							 1);
 				if (ret) {
 					btrfs_abort_transaction(trans, root,
 								ret);
@@ -2653,7 +2652,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
-							 &hint_byte, 1);
+							 1);
 				if (ret) {
 					btrfs_abort_transaction(trans, root,
 								ret);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5e2ffb95cc64..0c39f58973db 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -485,7 +485,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	int found_type;
 	u64 mask = root->sectorsize - 1;
 	u64 extent_end;
-	u64 alloc_hint;
 	u64 start = key->offset;
 	u64 saved_nbytes;
 	struct btrfs_file_extent_item *item;
@@ -551,8 +550,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
-	ret = btrfs_drop_extents(trans, root, inode, start, extent_end,
-				 &alloc_hint, 1);
+	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 	BUG_ON(ret);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -2843,9 +2841,8 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	int ret;
 
 	if (BTRFS_I(inode)->logged_trans == trans->transid) {
-		u64 tmp;
 		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
-					   start + len, &tmp, 0);
+					   start + len, 0);
 		if (ret)
 			return ret;
 	}

From 2aaa66558172b017f36bf38ae69372813dedee9d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 29 Aug 2012 14:27:18 -0400
Subject: [PATCH 038/121] Btrfs: add hole punching

This patch adds hole punching via fallocate.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ctree.h       |   4 +-
 fs/btrfs/extent-tree.c |   2 +
 fs/btrfs/file.c        | 332 ++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/inode.c       |  28 +++-
 fs/btrfs/tree-log.c    |   2 +-
 5 files changed, 355 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 71d6ff13d76e..88adfe638409 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3250,6 +3250,8 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct inode *dir, u64 objectid,
 			const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct inode *inode, u64 new_size,
@@ -3323,7 +3325,7 @@ extern const struct file_operations btrfs_file_operations;
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 int drop_cache);
+			 u64 *drop_end, int drop_cache);
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode, u64 start,
 		       u64 end, int drop_cache);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df2f17b0ac5d..ee51b7afe97d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4132,6 +4132,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv)
 {
+	if (!rsv)
+		return;
 	btrfs_block_rsv_release(root, rsv, (u64)-1);
 	kfree(rsv);
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 58598c249951..57026a6e9c94 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,6 +39,7 @@
 #include "tree-log.h"
 #include "locking.h"
 #include "compat.h"
+#include "volumes.h"
 
 /*
  * when auto defrag is enabled we
@@ -584,7 +585,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct inode *inode,
 			 struct btrfs_path *path, u64 start, u64 end,
-			 int drop_cache)
+			 u64 *drop_end, int drop_cache)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -822,6 +823,8 @@ next_slot:
 			btrfs_abort_transaction(trans, root, ret);
 	}
 
+	if (drop_end)
+		*drop_end = min(end, extent_end);
 	btrfs_release_path(path);
 	return ret;
 }
@@ -836,7 +839,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
+	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
 				   drop_cache);
 	btrfs_free_path(path);
 	return ret;
@@ -1645,6 +1648,324 @@ static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 	return 0;
 }
 
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+			  int slot, u64 start, u64 end)
+{
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+
+	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+		return 0;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != btrfs_ino(inode) ||
+	    key.type != BTRFS_EXTENT_DATA_KEY)
+		return 0;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_disk_bytenr(leaf, fi))
+		return 0;
+
+	if (key.offset == end)
+		return 1;
+	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+		return 1;
+	return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+		      struct btrfs_path *path, u64 offset, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct extent_map *hole_em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(!ret);
+
+	leaf = path->nodes[0];
+	if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+		u64 num_bytes;
+
+		path->slots[0]--;
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+			end - offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+
+	if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+		u64 num_bytes;
+
+		path->slots[0]++;
+		key.offset = offset;
+		btrfs_set_item_key_safe(trans, root, path, &key);
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+			offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, end - offset, 0, end - offset,
+				       0, 0, 0);
+	if (ret)
+		return ret;
+
+out:
+	btrfs_release_path(path);
+
+	hole_em = alloc_extent_map();
+	if (!hole_em) {
+		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+	} else {
+		hole_em->start = offset;
+		hole_em->len = end - offset;
+		hole_em->orig_start = offset;
+
+		hole_em->block_start = EXTENT_MAP_HOLE;
+		hole_em->block_len = 0;
+		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+		hole_em->compress_type = BTRFS_COMPRESS_NONE;
+		hole_em->generation = trans->transid;
+
+		do {
+			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, hole_em);
+			if (!ret)
+				list_move(&hole_em->list,
+					  &em_tree->modified_extents);
+			write_unlock(&em_tree->lock);
+		} while (ret == -EEXIST);
+		free_extent_map(hole_em);
+		if (ret)
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+	}
+
+	return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *rsv;
+	struct btrfs_trans_handle *trans;
+	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+	u64 lockstart = (offset + mask) & ~mask;
+	u64 lockend = ((offset + len) & ~mask) - 1;
+	u64 cur_offset = lockstart;
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+	u64 drop_end;
+	unsigned long nr;
+	int ret = 0;
+	int err = 0;
+	bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
+		((offset + len) >> PAGE_CACHE_SHIFT);
+
+	btrfs_wait_ordered_range(inode, offset, len);
+
+	mutex_lock(&inode->i_mutex);
+	if (offset >= inode->i_size) {
+		mutex_unlock(&inode->i_mutex);
+		return 0;
+	}
+
+	/*
+	 * Only do this if we are in the same page and we aren't doing the
+	 * entire page.
+	 */
+	if (same_page && len < PAGE_CACHE_SIZE) {
+		ret = btrfs_truncate_page(inode, offset, len, 0);
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
+
+	/* zero back part of the first page */
+	ret = btrfs_truncate_page(inode, offset, 0, 0);
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
+
+	/* zero the front end of the last page */
+	ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
+
+	if (lockend < lockstart) {
+		mutex_unlock(&inode->i_mutex);
+		return 0;
+	}
+
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, &cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and nobody raced in and read a page in this range, if we did
+		 * we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len < lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, EXTENT_UPTODATE, 0,
+				     cached_state)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_NOFS);
+		btrfs_wait_ordered_range(inode, lockstart,
+					 lockend - lockstart + 1);
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rsv = btrfs_alloc_block_rsv(root);
+	if (!rsv) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+	rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+	rsv->failfast = 1;
+
+	/*
+	 * 1 - update the inode
+	 * 1 - removing the extents in the range
+	 * 1 - adding the hole extent
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
+
+	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+				      min_size);
+	BUG_ON(ret);
+	trans->block_rsv = rsv;
+
+	while (cur_offset < lockend) {
+		ret = __btrfs_drop_extents(trans, root, inode, path,
+					   cur_offset, lockend + 1,
+					   &drop_end, 1);
+		if (ret != -ENOSPC)
+			break;
+
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+		ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		cur_offset = drop_end;
+
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root, nr);
+
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+					      rsv, min_size);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
+	}
+
+	if (ret) {
+		err = ret;
+		goto out_trans;
+	}
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+	if (ret) {
+		err = ret;
+		goto out_trans;
+	}
+
+out_trans:
+	if (!trans)
+		goto out_free;
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	ret = btrfs_update_inode(trans, root, inode);
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root, nr);
+out_free:
+	btrfs_free_path(path);
+	btrfs_free_block_rsv(root, rsv);
+out:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			     &cached_state, GFP_NOFS);
+	mutex_unlock(&inode->i_mutex);
+	if (ret && !err)
+		err = ret;
+	return err;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
@@ -1663,10 +1984,13 @@ static long btrfs_fallocate(struct file *file, int mode,
 	alloc_start = offset & ~mask;
 	alloc_end =  (offset + len + mask) & ~mask;
 
-	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
+	/* Make sure we aren't being give some crap mode */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return btrfs_punch_hole(inode, offset, len);
+
 	/*
 	 * Make sure we have enough space before we do the
 	 * allocation.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f0a4792492ed..8cb272c84089 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3475,12 +3475,20 @@ error:
 }
 
 /*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range respective to the
+ *	offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero.  This is used with truncate and hole punching.
  */
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front)
 {
-	struct inode *inode = mapping->host;
+	struct address_space *mapping = inode->i_mapping;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
@@ -3495,7 +3503,8 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	u64 page_start;
 	u64 page_end;
 
-	if ((offset & (blocksize - 1)) == 0)
+	if ((offset & (blocksize - 1)) == 0 &&
+	    (!len || ((len & (blocksize - 1)) == 0)))
 		goto out;
 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret)
@@ -3555,8 +3564,13 @@ again:
 
 	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
+		if (!len)
+			len = PAGE_CACHE_SIZE - offset;
 		kaddr = kmap(page);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		if (front)
+			memset(kaddr, 0, offset);
+		else
+			memset(kaddr + offset, 0, len);
 		flush_dcache_page(page);
 		kunmap(page);
 	}
@@ -6796,7 +6810,7 @@ static int btrfs_truncate(struct inode *inode)
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0c39f58973db..f9b0fc911661 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2842,7 +2842,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 
 	if (BTRFS_I(inode)->logged_trans == trans->transid) {
 		ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
-					   start + len, 0);
+					   start + len, NULL, 0);
 		if (ret)
 			return ret;
 	}

From 6fc4e3548598d10a5e947797a09cbc1b257a22ab Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 30 Aug 2012 16:26:15 -0600
Subject: [PATCH 039/121] Btrfs: pass lockdep rwsem metadata to async commit
 transaction

The freeze rwsem is taken by sb_start_intwrite() and dropped during the
commit_ or end_transaction().  In the async case, that happens in a worker
thread.  Tell lockdep the calling thread is releasing ownership of the
rwsem and the async thread is picking it up.

XFS plays the same trick in fs/xfs/xfs_aops.c.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/btrfs/transaction.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 27c26004e050..730c94590c9f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1229,6 +1229,14 @@ static void do_async_commit(struct work_struct *work)
 	struct btrfs_async_commit *ac =
 		container_of(work, struct btrfs_async_commit, work.work);
 
+	/*
+	 * We've got freeze protection passed with the transaction.
+	 * Tell lockdep about it.
+	 */
+	rwsem_acquire_read(
+		&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		0, 1, _THIS_IP_);
+
 	btrfs_commit_transaction(ac->newtrans, ac->root);
 	kfree(ac);
 }
@@ -1258,6 +1266,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	atomic_inc(&cur_trans->use_count);
 
 	btrfs_end_transaction(trans, root);
+
+	/*
+	 * Tell lockdep we've released the freeze rwsem, since the
+	 * async commit thread will be the one to unlock it.
+	 */
+	rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		      1, _THIS_IP_);
+
 	schedule_delayed_work(&ac->work, 0);
 
 	/* wait for transaction to start and unblock */

From e209db7ace281ca347b1ac699bf1fb222eac03fe Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 30 Aug 2012 16:26:16 -0600
Subject: [PATCH 040/121] Btrfs: set journal_info in async trans commit worker

We expect current->journal_info to point to the trans handle we are
committing.

Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/btrfs/transaction.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 730c94590c9f..ffc6b5202d5c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1237,6 +1237,8 @@ static void do_async_commit(struct work_struct *work)
 		&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
 		0, 1, _THIS_IP_);
 
+	current->journal_info = ac->newtrans;
+
 	btrfs_commit_transaction(ac->newtrans, ac->root);
 	kfree(ac);
 }

From ac14aed66558d686b1f95dac1f07ecfe11d8c30e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@inktank.com>
Date: Thu, 30 Aug 2012 16:26:17 -0600
Subject: [PATCH 041/121] Btrfs: do not take cleanup_work_sem in
 btrfs_run_delayed_iputs()

Josef has suggested that this is not necessary.  Removing it also avoids
this lockdep splat (after the new sb_internal locking stuff was added):

[  604.090449] ======================================================
[  604.114819] [ INFO: possible circular locking dependency detected ]
[  604.139262] 3.6.0-rc2-ceph-00144-g463b030 #1 Not tainted
[  604.162193] -------------------------------------------------------
[  604.186139] btrfs-cleaner/6669 is trying to acquire lock:
[  604.209555]  (sb_internal#2){.+.+..}, at: [<ffffffffa0042b84>] start_transaction+0x124/0x430 [btrfs]
[  604.257100]
[  604.257100] but task is already holding lock:
[  604.300366]  (&fs_info->cleanup_work_sem){.+.+..}, at: [<ffffffffa0048002>] btrfs_run_delayed_iputs+0x72/0x130 [btrfs]
[  604.352989]
[  604.352989] which lock already depends on the new lock.
[  604.352989]
[  604.427104]
[  604.427104] the existing dependency chain (in reverse order) is:
[  604.478493]
[  604.478493] -> #1 (&fs_info->cleanup_work_sem){.+.+..}:
[  604.529313]        [<ffffffff810b2c82>] lock_acquire+0xa2/0x140
[  604.559621]        [<ffffffff81632b69>] down_read+0x39/0x4e
[  604.589382]        [<ffffffffa004db98>] btrfs_lookup_dentry+0x218/0x550 [btrfs]
[  604.596161] btrfs: unlinked 1 orphans
[  604.675002]        [<ffffffffa006aadd>] create_subvol+0x62d/0x690 [btrfs]
[  604.708859]        [<ffffffffa006d666>] btrfs_mksubvol.isra.52+0x346/0x3a0 [btrfs]
[  604.772466]        [<ffffffffa006d7f2>] btrfs_ioctl_snap_create_transid+0x132/0x190 [btrfs]
[  604.842245]        [<ffffffffa006d8ae>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs]
[  604.912852]        [<ffffffffa00708ae>] btrfs_ioctl+0x138e/0x1990 [btrfs]
[  604.951888]        [<ffffffff8118e9b8>] do_vfs_ioctl+0x98/0x560
[  604.989961]        [<ffffffff8118ef11>] sys_ioctl+0x91/0xa0
[  605.026628]        [<ffffffff8163d569>] system_call_fastpath+0x16/0x1b
[  605.064404]
[  605.064404] -> #0 (sb_internal#2){.+.+..}:
[  605.126832]        [<ffffffff810b25e8>] __lock_acquire+0x1ac8/0x1b90
[  605.163671]        [<ffffffff810b2c82>] lock_acquire+0xa2/0x140
[  605.200228]        [<ffffffff8117dac6>] __sb_start_write+0xc6/0x1b0
[  605.236818]        [<ffffffffa0042b84>] start_transaction+0x124/0x430 [btrfs]
[  605.274029]        [<ffffffffa00431a3>] btrfs_start_transaction+0x13/0x20 [btrfs]
[  605.340520]        [<ffffffffa004ccfa>] btrfs_evict_inode+0x19a/0x330 [btrfs]
[  605.378720]        [<ffffffff811972c8>] evict+0xb8/0x1c0
[  605.416057]        [<ffffffff811974d5>] iput+0x105/0x210
[  605.452373]        [<ffffffffa0048082>] btrfs_run_delayed_iputs+0xf2/0x130 [btrfs]
[  605.521627]        [<ffffffffa003b5e1>] cleaner_kthread+0xa1/0x120 [btrfs]
[  605.560520]        [<ffffffff810791ee>] kthread+0xae/0xc0
[  605.598094]        [<ffffffff8163e744>] kernel_thread_helper+0x4/0x10
[  605.636499]
[  605.636499] other info that might help us debug this:
[  605.636499]
[  605.736504]  Possible unsafe locking scenario:
[  605.736504]
[  605.801931]        CPU0                    CPU1
[  605.835126]        ----                    ----
[  605.867093]   lock(&fs_info->cleanup_work_sem);
[  605.898594]                                lock(sb_internal#2);
[  605.931954]                                lock(&fs_info->cleanup_work_sem);
[  605.965359]   lock(sb_internal#2);
[  605.994758]
[  605.994758]  *** DEADLOCK ***
[  605.994758]
[  606.075281] 2 locks held by btrfs-cleaner/6669:
[  606.104528]  #0:  (&fs_info->cleaner_mutex){+.+...}, at: [<ffffffffa003b5d5>] cleaner_kthread+0x95/0x120 [btrfs]
[  606.165626]  #1:  (&fs_info->cleanup_work_sem){.+.+..}, at: [<ffffffffa0048002>] btrfs_run_delayed_iputs+0x72/0x130 [btrfs]
[  606.231297]
[  606.231297] stack backtrace:
[  606.287723] Pid: 6669, comm: btrfs-cleaner Not tainted 3.6.0-rc2-ceph-00144-g463b030 #1
[  606.347823] Call Trace:
[  606.376184]  [<ffffffff8162a77c>] print_circular_bug+0x1fb/0x20c
[  606.409243]  [<ffffffff810b25e8>] __lock_acquire+0x1ac8/0x1b90
[  606.441343]  [<ffffffffa0042b84>] ? start_transaction+0x124/0x430 [btrfs]
[  606.474583]  [<ffffffff810b2c82>] lock_acquire+0xa2/0x140
[  606.505934]  [<ffffffffa0042b84>] ? start_transaction+0x124/0x430 [btrfs]
[  606.539429]  [<ffffffff8132babd>] ? do_raw_spin_unlock+0x5d/0xb0
[  606.571719]  [<ffffffff8117dac6>] __sb_start_write+0xc6/0x1b0
[  606.603498]  [<ffffffffa0042b84>] ? start_transaction+0x124/0x430 [btrfs]
[  606.637405]  [<ffffffffa0042b84>] ? start_transaction+0x124/0x430 [btrfs]
[  606.670165]  [<ffffffff81172e75>] ? kmem_cache_alloc+0xb5/0x160
[  606.702144]  [<ffffffffa0042b84>] start_transaction+0x124/0x430 [btrfs]
[  606.735562]  [<ffffffffa00256a6>] ? block_rsv_add_bytes+0x56/0x80 [btrfs]
[  606.769861]  [<ffffffffa00431a3>] btrfs_start_transaction+0x13/0x20 [btrfs]
[  606.804575]  [<ffffffffa004ccfa>] btrfs_evict_inode+0x19a/0x330 [btrfs]
[  606.838756]  [<ffffffff81634c6b>] ? _raw_spin_unlock+0x2b/0x40
[  606.872010]  [<ffffffff811972c8>] evict+0xb8/0x1c0
[  606.903800]  [<ffffffff811974d5>] iput+0x105/0x210
[  606.935416]  [<ffffffffa0048082>] btrfs_run_delayed_iputs+0xf2/0x130 [btrfs]
[  606.970510]  [<ffffffffa003b5d5>] ? cleaner_kthread+0x95/0x120 [btrfs]
[  607.005648]  [<ffffffffa003b5e1>] cleaner_kthread+0xa1/0x120 [btrfs]
[  607.040724]  [<ffffffffa003b540>] ? btrfs_destroy_delayed_refs.isra.102+0x220/0x220 [btrfs]
[  607.104740]  [<ffffffff810791ee>] kthread+0xae/0xc0
[  607.137119]  [<ffffffff810b379d>] ? trace_hardirqs_on+0xd/0x10
[  607.169797]  [<ffffffff8163e744>] kernel_thread_helper+0x4/0x10
[  607.202472]  [<ffffffff81635430>] ? retint_restore_args+0x13/0x13
[  607.235884]  [<ffffffff81079140>] ? flush_kthread_work+0x1a0/0x1a0
[  607.268731]  [<ffffffff8163e740>] ? gs_change+0x13/0x13

Signed-off-by: Sage Weil <sage@inktank.com>
---
 fs/btrfs/inode.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8cb272c84089..073af0724bc0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2118,7 +2118,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 	if (empty)
 		return;
 
-	down_read(&root->fs_info->cleanup_work_sem);
 	spin_lock(&fs_info->delayed_iput_lock);
 	list_splice_init(&fs_info->delayed_iputs, &list);
 	spin_unlock(&fs_info->delayed_iput_lock);
@@ -2129,7 +2128,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 		iput(delayed->inode);
 		kfree(delayed);
 	}
-	up_read(&root->fs_info->cleanup_work_sem);
 }
 
 enum btrfs_orphan_cleanup_state {

From 7014cdb49305eda0767d2ae6136f8c191ea8fd81 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 30 Aug 2012 20:06:49 -0400
Subject: [PATCH 042/121] Btrfs: btrfs_drop_extent_cache should never fail

I noticed this when I was doing the fsync stuff, we allocate split extents if we
drop an extent range that is in the middle of an existing extent.  This BUG()'s
if we fail to allocate memory, but the fact is this is just a cache, we will
just regenerate the cache if we need it, the important part is that we free the
range we are given.  This can be done without allocations, so if we fail to
allocate splits just skip the splitting stage and free our em and look for more
extents to drop.  This also makes btrfs_drop_extent_cache a void since nobody
was checking the return value anyway.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ctree.h |  4 ++--
 fs/btrfs/file.c  | 13 +++++++++----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 88adfe638409..b7cd3adb5a58 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3316,8 +3316,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-			    int skip_pinned);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			     int skip_pinned);
 int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
 			       u64 start, u64 end, int skip_pinned,
 			       int modified);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 57026a6e9c94..a50e98733e28 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -459,8 +459,8 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
  * this drops all the extents in the cache that intersect the range
  * [start, end].  Existing extents are split as required.
  */
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-			       int skip_pinned)
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			     int skip_pinned)
 {
 	struct extent_map *em;
 	struct extent_map *split = NULL;
@@ -479,11 +479,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		testend = 0;
 	}
 	while (1) {
+		int no_splits = 0;
+
 		if (!split)
 			split = alloc_extent_map();
 		if (!split2)
 			split2 = alloc_extent_map();
-		BUG_ON(!split || !split2); /* -ENOMEM */
+		if (!split || !split2)
+			no_splits = 1;
 
 		write_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
@@ -509,6 +512,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		remove_extent_mapping(em_tree, em);
+		if (no_splits)
+			goto next;
 
 		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
 		    em->start < start) {
@@ -559,6 +564,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			free_extent_map(split);
 			split = NULL;
 		}
+next:
 		write_unlock(&em_tree->lock);
 
 		/* once for us */
@@ -570,7 +576,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		free_extent_map(split);
 	if (split2)
 		free_extent_map(split2);
-	return 0;
 }
 
 /*

From cf93dccea67ad8f5e0d9163c6a0a584550bbd7cd Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Sun, 2 Sep 2012 07:44:51 -0600
Subject: [PATCH 043/121] Btrfs: fix possible memory leak in
 scrub_setup_recheck_block()

bbio has been malloced in btrfs_map_block() and should be
freed before leaving from the error handling cases.

spatch with a semantic match is used to found this problem.
(http://coccinelle.lip6.fr/)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
---
 fs/btrfs/scrub.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b223620cd5a6..4e9eafe01c55 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1029,6 +1029,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 				spin_lock(&sdev->stat_lock);
 				sdev->stat.malloc_errors++;
 				spin_unlock(&sdev->stat_lock);
+				kfree(bbio);
 				return -ENOMEM;
 			}
 			sblock->page_count++;

From 6fa9700e734275de2acbcb0e99414bd7ddfc60f1 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:00:32 -0600
Subject: [PATCH 044/121] Btrfs: fix error path in create_pending_snapshot()

This patch fixes the following problem:
- If we failed to deal with the delayed dir items, we should abort transaction,
  just as its comment said. Fix it.
- If root reference or root back reference insertion failed, we should
  abort transaction. Fix it.
- Fix the double free problem of pending->inherit.
- Do not restore the trans->rsv if we doesn't change it.
- make the error path more clearly.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/transaction.c | 40 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ffc6b5202d5c..7be031853273 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -967,18 +967,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	u64 root_flags;
 	uuid_le new_uuid;
 
-	rsv = trans->block_rsv;
-
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
 		ret = pending->error = -ENOMEM;
-		goto fail;
+		goto root_item_alloc_fail;
 	}
 
 	ret = btrfs_find_free_objectid(tree_root, &objectid);
 	if (ret) {
 		pending->error = ret;
-		goto fail;
+		goto no_free_objectid;
 	}
 
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
@@ -988,22 +986,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 						  to_reserve);
 		if (ret) {
 			pending->error = ret;
-			goto fail;
+			goto no_free_objectid;
 		}
 	}
 
 	ret = btrfs_qgroup_inherit(trans, fs_info, root->root_key.objectid,
 				   objectid, pending->inherit);
-	kfree(pending->inherit);
 	if (ret) {
 		pending->error = ret;
-		goto fail;
+		goto no_free_objectid;
 	}
 
 	key.objectid = objectid;
 	key.offset = (u64)-1;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 
+	rsv = trans->block_rsv;
 	trans->block_rsv = &pending->block_rsv;
 
 	dentry = pending->dentry;
@@ -1023,10 +1021,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				BTRFS_FT_DIR, index);
 	if (ret == -EEXIST) {
 		pending->error = -EEXIST;
-		dput(parent);
 		goto fail;
 	} else if (ret) {
-		goto abort_trans_dput;
+		goto abort_trans;
 	}
 
 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
@@ -1034,7 +1031,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	if (ret)
-		goto abort_trans_dput;
+		goto abort_trans;
 
 	/*
 	 * pull in the delayed directory update
@@ -1043,10 +1040,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 * snapshot
 	 */
 	ret = btrfs_run_delayed_items(trans, root);
-	if (ret) { /* Transaction aborted */
-		dput(parent);
-		goto fail;
-	}
+	if (ret)	/* Transaction aborted */
+		goto abort_trans;
 
 	record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
@@ -1079,7 +1074,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret) {
 		btrfs_tree_unlock(old);
 		free_extent_buffer(old);
-		goto abort_trans_dput;
+		goto abort_trans;
 	}
 
 	btrfs_set_lock_blocking(old);
@@ -1089,7 +1084,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
 	if (ret)
-		goto abort_trans_dput;
+		goto abort_trans;
 
 	/* see comments in should_cow_block() */
 	root->force_cow = 1;
@@ -1102,7 +1097,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_tree_unlock(tmp);
 	free_extent_buffer(tmp);
 	if (ret)
-		goto abort_trans_dput;
+		goto abort_trans;
 
 	/*
 	 * insert root back/forward references
@@ -1111,9 +1106,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				 parent_root->root_key.objectid,
 				 btrfs_ino(parent_inode), index,
 				 dentry->d_name.name, dentry->d_name.len);
-	dput(parent);
 	if (ret)
-		goto fail;
+		goto abort_trans;
 
 	key.offset = (u64)-1;
 	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
@@ -1125,15 +1119,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_reloc_post_snapshot(trans, pending);
 	if (ret)
 		goto abort_trans;
-	ret = 0;
 fail:
-	kfree(new_root_item);
+	dput(parent);
 	trans->block_rsv = rsv;
+no_free_objectid:
+	kfree(new_root_item);
+root_item_alloc_fail:
 	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return ret;
 
-abort_trans_dput:
-	dput(parent);
 abort_trans:
 	btrfs_abort_transaction(trans, root, ret);
 	goto fail;

From 361048f586f59d414421c6486dd846063a0cac98 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:00:57 -0600
Subject: [PATCH 045/121] Btrfs: fix full backref problem when inserting shared
 block reference

If we create several snapshots at the same time, the following BUG_ON() will be
triggered.

	kernel BUG at fs/btrfs/extent-tree.c:6047!

Steps to reproduce:
 # mkfs.btrfs <partition>
 # mount <partition> <mnt>
 # cd <mnt>
 # for ((i=0;i<2400;i++)); do touch long_name_to_make_tree_more_deep$i; done
 # for ((i=0; i<4; i++))
 > do
 > mkdir $i
 > for ((j=0; j<200; j++))
 > do
 > btrfs sub snap . $i/$j
 > done &
 > done

The reason is:
Before transaction commit, some operations changed the fs tree and new tree
blocks were allocated because of COW. We used the implicit non-shared back
reference for those newly allocated tree blocks because they were not shared by
two or more trees.

And then we created the first snapshot for the fs tree, according to the back
reference rules, we also used implicit back refs for the child tree blocks of
the root node of the fs tree, now those child nodes/leaves were shared by two
trees.

Then We didn't deal with the delayed references, and continued to change the fs
tree(created the second snapshot and inserted the dir item of the new snapshot
into the fs tree). According to the rules of the back reference, we added full
back refs for those tree blocks whose parents have be shared by two trees.
Now some newly allocated tree blocks had two types of the references.

As we know, the delayed reference system handles these delayed references from
back to front, and the full delayed reference is inserted after the implicit
ones. So when we dealt with the back references of those newly allocated tree
blocks, the full references was dealt with at first. And if the first reference
is a shared back reference and the tree block that the reference points to is
newly allocated, It would be considered as a tree block which is shared by two
or more trees when it is allocated and should be a full back reference not a
implicit one, the flag of its reference also should be set to FULL_BACKREF.
But in fact, it was a non-shared tree block with a implicit reference at
beginning, so it was not compulsory to set the flags to FULL_BACKREF. So BUG_ON
was triggered.

We have several methods to fix this bug:
1. deal with delayed references after the snapshot is created and before we
   change the source tree of the snapshot. This is the easiest and safest way.
2. modify the sort method of the delayed reference tree, make the full delayed
   references be inserted before the implicit ones. It is also very easy, but
   I don't know if it will introduce some problems or not.
3. modify select_delayed_ref() and make it select the implicit delayed reference
   at first. This way is not so good because it may wastes CPU time if we have
   lots of delayed references.
4. set the flags to FULL_BACKREF, this method is a little complex comparing with
   the 1st way.

I chose the 1st way to fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/transaction.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7be031853273..4b6ce5cab444 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1119,6 +1119,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_reloc_post_snapshot(trans, pending);
 	if (ret)
 		goto abort_trans;
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret)
+		goto abort_trans;
 fail:
 	dput(parent);
 	trans->block_rsv = rsv;

From b9a8cc5bef963b76c5b6c3016b7e91988a3e758b Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:01:21 -0600
Subject: [PATCH 046/121] Btrfs: fix file extent discount problem in the,
 snapshot

If a snapshot is created while we are writing some data into the file,
the i_size of the corresponding file in the snapshot will be wrong, it will
be beyond the end of the last file extent. And btrfsck will report:
  root 256 inode 257 errors 100

Steps to reproduce:
 # mkfs.btrfs <partition>
 # mount <partition> <mnt>
 # cd <mnt>
 # dd if=/dev/zero of=tmpfile bs=4M count=1024 &
 # for ((i=0; i<4; i++))
 > do
 > btrfs sub snap . $i
 > done

This because the algorithm of disk_i_size update is wrong. Though there are
some ordered extents behind the current one which we use to update disk_i_size,
it doesn't mean those extents will be dealt with in the same transaction. So
We shouldn't use the offset of those extents to update disk_i_size. Or we will
get the wrong i_size in the snapshot.

We fix this problem by recording the max real i_size. If we find there is a
ordered extent which is in front of the current one and doesn't complete, we
will record the end of the current one into that ordered extent. Surely, if
the current extent holds the end of other extent(it must be greater than
the current one because it is behind the current one), we will record the
number that the current extent holds. In this way, we can exclude the ordered
extents that may not be dealth with in the same transaction, and be easy to
know the real disk_i_size.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ordered-data.c | 62 ++++++++++++-----------------------------
 fs/btrfs/ordered-data.h |  7 +++++
 2 files changed, 25 insertions(+), 44 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 051c7fe551dd..cd8ecb73c05c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -775,7 +775,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
 	u64 disk_i_size;
 	u64 new_i_size;
-	u64 i_size_test;
 	u64 i_size = i_size_read(inode);
 	struct rb_node *node;
 	struct rb_node *prev = NULL;
@@ -835,55 +834,30 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 			break;
 		if (test->file_offset >= i_size)
 			break;
-		if (test->file_offset >= disk_i_size)
+		if (test->file_offset >= disk_i_size) {
+			/*
+			 * we don't update disk_i_size now, so record this
+			 * undealt i_size. Or we will not know the real
+			 * i_size.
+			 */
+			if (test->outstanding_isize < offset)
+				test->outstanding_isize = offset;
+			if (ordered &&
+			    ordered->outstanding_isize >
+			    test->outstanding_isize)
+				test->outstanding_isize =
+						ordered->outstanding_isize;
 			goto out;
+		}
 	}
 	new_i_size = min_t(u64, offset, i_size);
 
 	/*
-	 * at this point, we know we can safely update i_size to at least
-	 * the offset from this ordered extent.  But, we need to
-	 * walk forward and see if ios from higher up in the file have
-	 * finished.
+	 * Some ordered extents may completed before the current one, and
+	 * we hold the real i_size in ->outstanding_isize.
 	 */
-	if (ordered) {
-		node = rb_next(&ordered->rb_node);
-	} else {
-		if (prev)
-			node = rb_next(prev);
-		else
-			node = rb_first(&tree->tree);
-	}
-
-	/*
-	 * We are looking for an area between our current extent and the next
-	 * ordered extent to update the i_size to.  There are 3 cases here
-	 *
-	 * 1) We don't actually have anything and we can update to i_size.
-	 * 2) We have stuff but they already did their i_size update so again we
-	 * can just update to i_size.
-	 * 3) We have an outstanding ordered extent so the most we can update
-	 * our disk_i_size to is the start of the next offset.
-	 */
-	i_size_test = i_size;
-	for (; node; node = rb_next(node)) {
-		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-
-		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
-			continue;
-		if (test->file_offset > offset) {
-			i_size_test = test->file_offset;
-			break;
-		}
-	}
-
-	/*
-	 * i_size_test is the end of a region after this ordered
-	 * extent where there are no ordered extents, we can safely set
-	 * disk_i_size to this.
-	 */
-	if (i_size_test > offset)
-		new_i_size = min_t(u64, i_size_test, i_size);
+	if (ordered && ordered->outstanding_isize > new_i_size)
+		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
 	BTRFS_I(inode)->disk_i_size = new_i_size;
 	ret = 0;
 out:
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e03c560d2997..c2443a431ca5 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -96,6 +96,13 @@ struct btrfs_ordered_extent {
 	/* number of bytes that still need writing */
 	u64 bytes_left;
 
+	/*
+	 * the end of the ordered extent which is behind it but
+	 * didn't update disk_i_size. Please see the comment of
+	 * btrfs_ordered_update_i_size();
+	 */
+	u64 outstanding_isize;
+
 	/* flags (described above) */
 	unsigned long flags;
 

From 6352b91da1a2108bb8cc5115e8714f90d706f15f Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:01:51 -0600
Subject: [PATCH 047/121] Btrfs: use a slab for ordered extents allocation

The ordered extent allocation is in the fast path of the IO, so use a slab
to improve the speed of the allocation.

 "Size of the struct is 280, so this will fall into the size-512 bucket,
  giving 8 objects per page, while own slab will pack 14 objects into a page.

  Another benefit I see is to check for leaked objects when the module is
  removed (and the cache destroy takes place)."
						-- David Sterba

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ordered-data.c | 23 +++++++++++++++++++++--
 fs/btrfs/ordered-data.h |  2 ++
 fs/btrfs/super.c        |  9 ++++++++-
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index cd8ecb73c05c..e2b3d994ec01 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,8 @@
 #include "btrfs_inode.h"
 #include "extent_io.h"
 
+static struct kmem_cache *btrfs_ordered_extent_cache;
+
 static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
 	if (entry->file_offset + entry->len < entry->file_offset)
@@ -187,7 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	struct btrfs_ordered_extent *entry;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	entry = kzalloc(sizeof(*entry), GFP_NOFS);
+	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
 	if (!entry)
 		return -ENOMEM;
 
@@ -421,7 +423,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 			list_del(&sum->list);
 			kfree(sum);
 		}
-		kfree(entry);
+		kmem_cache_free(btrfs_ordered_extent_cache, entry);
 	}
 }
 
@@ -958,3 +960,20 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 	}
 	spin_unlock(&root->fs_info->ordered_extent_lock);
 }
+
+int __init ordered_data_init(void)
+{
+	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+				     sizeof(struct btrfs_ordered_extent), 0,
+				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				     NULL);
+	if (!btrfs_ordered_extent_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ordered_data_exit(void)
+{
+	if (btrfs_ordered_extent_cache)
+		kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c2443a431ca5..d1ddaeff1356 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -192,4 +192,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct inode *inode);
 void btrfs_wait_ordered_extents(struct btrfs_root *root,
 				int nocow_only, int delay_iput);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83d6f9f9c220..06ff1dd0f9b7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1620,10 +1620,14 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		goto free_extent_io;
 
-	err = btrfs_delayed_inode_init();
+	err = ordered_data_init();
 	if (err)
 		goto free_extent_map;
 
+	err = btrfs_delayed_inode_init();
+	if (err)
+		goto free_ordered_data;
+
 	err = btrfs_interface_init();
 	if (err)
 		goto free_delayed_inode;
@@ -1641,6 +1645,8 @@ unregister_ioctl:
 	btrfs_interface_exit();
 free_delayed_inode:
 	btrfs_delayed_inode_exit();
+free_ordered_data:
+	ordered_data_exit();
 free_extent_map:
 	extent_map_exit();
 free_extent_io:
@@ -1657,6 +1663,7 @@ static void __exit exit_btrfs_fs(void)
 {
 	btrfs_destroy_cachep();
 	btrfs_delayed_inode_exit();
+	ordered_data_exit();
 	extent_map_exit();
 	extent_io_exit();
 	btrfs_interface_exit();

From 66d8f3dd1c87813d7f1cf8b774cb03e9b8d7e87e Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:02:28 -0600
Subject: [PATCH 048/121] Btrfs: add a new "type" field into the block
 reservation structure

Sometimes we need choose the method of the reservation according to the type
of the block reservation, such as the reservation for the delayed inode update.
Now we identify the type just by comparing the address of the reservation
variants, it is very ugly if it is a temporary one because we need compare it
with all the common reservation variants. So we add a new "type" field to keep
the type the reservation variants.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ctree.h         | 18 ++++++++++++++----
 fs/btrfs/delayed-inode.c |  4 ++--
 fs/btrfs/disk-io.c       | 15 +++++++++------
 fs/btrfs/extent-tree.c   |  8 +++++---
 fs/btrfs/file.c          |  2 +-
 fs/btrfs/inode.c         |  8 ++++----
 fs/btrfs/ioctl.c         |  3 ++-
 fs/btrfs/relocation.c    |  3 ++-
 8 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7cd3adb5a58..2990a7ea6248 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1028,13 +1028,22 @@ struct btrfs_space_info {
 	wait_queue_head_t wait;
 };
 
+#define	BTRFS_BLOCK_RSV_GLOBAL		1
+#define	BTRFS_BLOCK_RSV_DELALLOC	2
+#define	BTRFS_BLOCK_RSV_TRANS		3
+#define	BTRFS_BLOCK_RSV_CHUNK		4
+#define	BTRFS_BLOCK_RSV_DELOPS		5
+#define	BTRFS_BLOCK_RSV_EMPTY		6
+#define	BTRFS_BLOCK_RSV_TEMP		7
+
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
 	struct btrfs_space_info *space_info;
 	spinlock_t lock;
-	unsigned int full;
-	unsigned int failfast;
+	unsigned short full;
+	unsigned short type;
+	unsigned short failfast;
 };
 
 /*
@@ -2875,8 +2884,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+					      unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 07d5eeb1e6f1..eb768c4c1358 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -650,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
 	 * we're accounted for.
 	 */
 	if (!src_rsv || (!trans->bytes_reserved &&
-	    src_rsv != &root->fs_info->delalloc_block_rsv)) {
+			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
 		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
@@ -668,7 +668,7 @@ static int btrfs_delayed_inode_reserve_metadata(
 						      num_bytes, 1);
 		}
 		return ret;
-	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+	} else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
 		spin_lock(&BTRFS_I(inode)->lock);
 		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
 				       &BTRFS_I(inode)->runtime_flags)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22e98e04c2ea..0dcfb998229e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2014,12 +2014,15 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	btrfs_mapping_init(&fs_info->mapping_tree);
-	btrfs_init_block_rsv(&fs_info->global_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
-	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
-	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv,
+			     BTRFS_BLOCK_RSV_GLOBAL);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+			     BTRFS_BLOCK_RSV_DELALLOC);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+			     BTRFS_BLOCK_RSV_DELOPS);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ee51b7afe97d..36e03312267a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4108,13 +4108,15 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
 	return 0;
 }
 
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 {
 	memset(rsv, 0, sizeof(*rsv));
 	spin_lock_init(&rsv->lock);
+	rsv->type = type;
 }
 
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+					      unsigned short type)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4123,7 +4125,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 	if (!block_rsv)
 		return NULL;
 
-	btrfs_init_block_rsv(block_rsv);
+	btrfs_init_block_rsv(block_rsv, type);
 	block_rsv->space_info = __find_space_info(fs_info,
 						  BTRFS_BLOCK_GROUP_METADATA);
 	return block_rsv;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a50e98733e28..a9d7815cf58e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1874,7 +1874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		goto out;
 	}
 
-	rsv = btrfs_alloc_block_rsv(root);
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv) {
 		ret = -ENOMEM;
 		goto out_free;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 073af0724bc0..d34eb329720d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2195,7 +2195,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	int ret;
 
 	if (!root->orphan_block_rsv) {
-		block_rsv = btrfs_alloc_block_rsv(root);
+		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 		if (!block_rsv)
 			return -ENOMEM;
 	}
@@ -3070,7 +3070,7 @@ out:
 static void __unlink_end_trans(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
-	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+	if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
 		btrfs_block_rsv_release(root, trans->block_rsv,
 					trans->bytes_reserved);
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3821,7 +3821,7 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 
-	rsv = btrfs_alloc_block_rsv(root);
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
@@ -6851,7 +6851,7 @@ static int btrfs_truncate(struct inode *inode)
 	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
 	 * updating the inode.
 	 */
-	rsv = btrfs_alloc_block_rsv(root);
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv)
 		return -ENOMEM;
 	rsv->size = min_size;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5543fd562b55..e6934de55a8b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -516,7 +516,8 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!pending_snapshot)
 		return -ENOMEM;
 
-	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+			     BTRFS_BLOCK_RSV_TEMP);
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
 	pending_snapshot->readonly = readonly;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4da08652004d..5a15c96a18ff 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3674,7 +3674,8 @@ int prepare_to_relocate(struct reloc_control *rc)
 	struct btrfs_trans_handle *trans;
 	int ret;
 
-	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+					      BTRFS_BLOCK_RSV_TEMP);
 	if (!rc->block_rsv)
 		return -ENOMEM;
 

From 42874b3db7817f662b1d7c6e32f8b63638fa0321 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:03:32 -0600
Subject: [PATCH 049/121] Btrfs: fix the snapshot that should not exist

The snapshot should be the image of the fs tree before it was created,
so the metadata of the snapshot should not exist in the its tree. But now, we
found the directory item and directory name index is in both the snapshot tree
and the fs tree. It introduces some problems and makes the users feel strange:

 # mkfs.btrfs /dev/sda1
 # mount /dev/sda1 /mnt
 # mkdir /mnt/1
 # cd /mnt/1
 # btrfs subvolume snapshot /mnt snap0
 # ls -a /mnt/1/snap0/1
 .	..	[no other file/dir]

 # ll /mnt/1/snap0/
 total 0
 drwxr-xr-x 1 root root 10 Ju1 24 12:11 1
			^^^
			There is no file/dir in it, but it's size is 10

 # cd /mnt/1/snap0/1/snap0
 [Enter a unexisted directory successfully...]

There is nothing in the directory 1 in snap0, but btrfs told the length of
this directory is 10. Beside that, we can enter an unexisted directory, it is
very strange to the users.

 # btrfs subvolume snapshot /mnt/1/snap0 /mnt/snap1
 # ll /mnt/1/snap0/1/
 total 0
 [None]
 # ll /mnt/snap1/1/
 total 0
 drwxr-xr-x 1 root root 0 Ju1 24 12:14 snap0

And the source of snap1 did have any directory in Directory 1, but snap1 have
a snap0, it is different between the source and the snapshot.

So I think we should insert directory item and directory name index and update
the parent inode as the last step of snapshot creation, and do not leave the
useless metadata in the file tree.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/transaction.c | 68 ++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4b6ce5cab444..a86fc723aad9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -955,6 +955,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root *parent_root;
 	struct btrfs_block_rsv *rsv;
 	struct inode *parent_inode;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *dir_item;
 	struct dentry *parent;
 	struct dentry *dentry;
 	struct extent_buffer *tmp;
@@ -967,6 +969,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	u64 root_flags;
 	uuid_le new_uuid;
 
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = pending->error = -ENOMEM;
+		goto path_alloc_fail;
+	}
+
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
 		ret = pending->error = -ENOMEM;
@@ -1015,23 +1023,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 */
 	ret = btrfs_set_inode_index(parent_inode, &index);
 	BUG_ON(ret); /* -ENOMEM */
-	ret = btrfs_insert_dir_item(trans, parent_root,
-				dentry->d_name.name, dentry->d_name.len,
-				parent_inode, &key,
-				BTRFS_FT_DIR, index);
-	if (ret == -EEXIST) {
+
+	/* check if there is a file/dir which has the same name. */
+	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+					 btrfs_ino(parent_inode),
+					 dentry->d_name.name,
+					 dentry->d_name.len, 0);
+	if (dir_item != NULL && !IS_ERR(dir_item)) {
 		pending->error = -EEXIST;
 		goto fail;
-	} else if (ret) {
+	} else if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
 		goto abort_trans;
 	}
-
-	btrfs_i_size_write(parent_inode, parent_inode->i_size +
-					 dentry->d_name.len * 2);
-	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-	ret = btrfs_update_inode(trans, parent_root, parent_inode);
-	if (ret)
-		goto abort_trans;
+	btrfs_release_path(path);
 
 	/*
 	 * pull in the delayed directory update
@@ -1123,12 +1128,30 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 	if (ret)
 		goto abort_trans;
+
+	ret = btrfs_insert_dir_item(trans, parent_root,
+				    dentry->d_name.name, dentry->d_name.len,
+				    parent_inode, &key,
+				    BTRFS_FT_DIR, index);
+	/* We have check then name at the beginning, so it is impossible. */
+	BUG_ON(ret == -EEXIST);
+	if (ret)
+		goto abort_trans;
+
+	btrfs_i_size_write(parent_inode, parent_inode->i_size +
+					 dentry->d_name.len * 2);
+	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, parent_root, parent_inode);
+	if (ret)
+		goto abort_trans;
 fail:
 	dput(parent);
 	trans->block_rsv = rsv;
 no_free_objectid:
 	kfree(new_root_item);
 root_item_alloc_fail:
+	btrfs_free_path(path);
+path_alloc_fail:
 	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return ret;
 
@@ -1472,13 +1495,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	mutex_lock(&root->fs_info->reloc_mutex);
 
-	ret = btrfs_run_delayed_items(trans, root);
+	/*
+	 * We needn't worry about the delayed items because we will
+	 * deal with them in create_pending_snapshot(), which is the
+	 * core function of the snapshot creation.
+	 */
+	ret = create_pending_snapshots(trans, root->fs_info);
 	if (ret) {
 		mutex_unlock(&root->fs_info->reloc_mutex);
 		goto cleanup_transaction;
 	}
 
-	ret = create_pending_snapshots(trans, root->fs_info);
+	/*
+	 * We insert the dir indexes of the snapshots and update the inode
+	 * of the snapshots' parents after the snapshot creation, so there
+	 * are some delayed items which are not dealt with. Now deal with
+	 * them.
+	 *
+	 * We needn't worry that this operation will corrupt the snapshots,
+	 * because all the tree which are snapshoted will be forced to COW
+	 * the nodes and leaves.
+	 */
+	ret = btrfs_run_delayed_items(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->reloc_mutex);
 		goto cleanup_transaction;

From 48c03c4bcfd7a1fcb1e05e9b1db1188cdbecf49a Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:03:56 -0600
Subject: [PATCH 050/121] Btrfs: fix wrong size for the reservation of the,
 snapshot creation

We should insert/update 6 items(root ref, root backref, dir item, dir index,
root item and parent inode) when creating a snapshot, not 5 items, fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c | 6 +++---
 fs/btrfs/ioctl.c       | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 36e03312267a..b6b33e463ac3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4414,10 +4414,10 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
 	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
 	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
 	/*
-	 * two for root back/forward refs, two for directory entries
-	 * and one for root of the snapshot.
+	 * two for root back/forward refs, two for directory entries,
+	 * one for root of the snapshot and one for parent inode.
 	 */
-	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
 	dst_rsv->space_info = src_rsv->space_info;
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e6934de55a8b..c56daa368768 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -526,7 +526,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 		*inherit = NULL;	/* take responsibility to free it */
 	}
 
-	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+	trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto fail;

From 2ecb79239bcd04c9d410f4cdce16adb6840b19da Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:04:27 -0600
Subject: [PATCH 051/121] Btrfs: fix unprotected ->log_batch

We forget to protect ->log_batch when syncing a file, this patch fix
this problem by atomic operation. And ->log_batch is used to check
if there are parallel sync operations or not, so it is unnecessary to
reset it to 0 after the sync operation of the current log tree complete.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ctree.h    |  2 +-
 fs/btrfs/disk-io.c  |  2 +-
 fs/btrfs/file.c     |  4 ++--
 fs/btrfs/tree-log.c | 12 +++++-------
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2990a7ea6248..6923b9e4f90d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1491,9 +1491,9 @@ struct btrfs_root {
 	wait_queue_head_t log_commit_wait[2];
 	atomic_t log_writers;
 	atomic_t log_commit[2];
+	atomic_t log_batch;
 	unsigned long log_transid;
 	unsigned long last_log_commit;
-	unsigned long log_batch;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0dcfb998229e..34717ac2416a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1168,8 +1168,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_commit[0], 0);
 	atomic_set(&root->log_commit[1], 0);
 	atomic_set(&root->log_writers, 0);
+	atomic_set(&root->log_batch, 0);
 	atomic_set(&root->orphan_inodes, 0);
-	root->log_batch = 0;
 	root->log_transid = 0;
 	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a9d7815cf58e..793bc89c660f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1551,9 +1551,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * ordered range does a filemape_write_and_wait_range which is why we
 	 * don't do it above like other file systems.
 	 */
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 	btrfs_wait_ordered_range(inode, start, end);
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 
 	/*
 	 * check the transaction that last modified this inode
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f9b0fc911661..038a5229404a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -147,7 +147,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			root->log_multiple_pids = true;
 		}
 
-		root->log_batch++;
+		atomic_inc(&root->log_batch);
 		atomic_inc(&root->log_writers);
 		mutex_unlock(&root->log_mutex);
 		return 0;
@@ -166,7 +166,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			err = ret;
 	}
 	mutex_unlock(&root->fs_info->tree_log_mutex);
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 	atomic_inc(&root->log_writers);
 	mutex_unlock(&root->log_mutex);
 	return err;
@@ -2036,7 +2036,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
 		wait_log_commit(trans, root, root->log_transid - 1);
 	while (1) {
-		unsigned long batch = root->log_batch;
+		int batch = atomic_read(&root->log_batch);
 		/* when we're on an ssd, just kick the log commit out */
 		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
 			mutex_unlock(&root->log_mutex);
@@ -2044,7 +2044,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			mutex_lock(&root->log_mutex);
 		}
 		wait_for_writer(trans, root);
-		if (batch == root->log_batch)
+		if (batch == atomic_read(&root->log_batch))
 			break;
 	}
 
@@ -2073,7 +2073,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	btrfs_set_root_node(&log->root_item, log->node);
 
-	root->log_batch = 0;
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
@@ -2086,7 +2085,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->log_mutex);
 
 	mutex_lock(&log_root_tree->log_mutex);
-	log_root_tree->log_batch++;
+	atomic_inc(&log_root_tree->log_batch);
 	atomic_inc(&log_root_tree->log_writers);
 	mutex_unlock(&log_root_tree->log_mutex);
 
@@ -2156,7 +2155,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
 				btrfs_header_level(log_root_tree->node));
 
-	log_root_tree->log_batch = 0;
 	log_root_tree->log_transid++;
 	smp_mb();
 

From 69ce977a179750915e04fcc12bfbe33e6c8f5132 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:04:44 -0600
Subject: [PATCH 052/121] Btrfs: output more information when aborting a unused
 transaction handle

Though we dump the stack information when aborting a unused transaction
handle, we don't know the correct place where we decide to abort the
transaction handle if one function has several place where the transaction
abort function is invoked and jumps to the same place after this call.
And beside that we also don't know the reason why we jump to abort
the current handle. So I modify the transaction abort function and make
it output the function name, line and error information.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/super.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 06ff1dd0f9b7..867e8e799dea 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -248,7 +248,13 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 	/* Nothing used. The other threads that have joined this
 	 * transaction may be able to continue. */
 	if (!trans->blocks_used) {
-		btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
+		char nbuf[16];
+		const char *errstr;
+
+		errstr = btrfs_decode_error(root->fs_info, errno, nbuf);
+		btrfs_printk(root->fs_info,
+			     "%s:%d: Aborting unused transaction(%s).\n",
+			     function, line, errstr);
 		return;
 	}
 	trans->transaction->aborted = errno;

From 903889f462409c816893abd02d88636f7b4a7774 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 6 Sep 2012 04:04:57 -0600
Subject: [PATCH 053/121] Btrfs: fix wrong size for the reservation when doing,
 file pre-allocation.

When we ran fsstress(a program in xfstests), the filesystem hung up when it
is full. It was because the space reserved in btrfs_fallocate() was wrong,
btrfs_fallocate() just used the size of the pre-allocation to reserve the
space, didn't took the block size aligning into account, so the size of
the reserved space was less than the allocated space, it caused the over
reserve problem and made the filesystem hung up when invoking cow_file_range().
Fix it.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 793bc89c660f..00279c9a7f35 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2000,7 +2000,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 * Make sure we have enough space before we do the
 	 * allocation.
 	 */
-	ret = btrfs_check_data_free_space(inode, len);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
 	if (ret)
 		return ret;
 
@@ -2107,7 +2107,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, len);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
 	return ret;
 }
 

From f54fb859da53f04a443c5e3f4cb9b936ed42d227 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 6 Sep 2012 03:08:59 -0600
Subject: [PATCH 054/121] Btrfs: fix error handling in
 delete_block_group_cache()

btrfs_iget() never return NULL.
So, NULL check is unnecessary.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/relocation.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5a15c96a18ff..7e7fd1bcfc54 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3270,8 +3270,8 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 	key.offset = 0;
 
 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
-	if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
-		if (inode && !IS_ERR(inode))
+	if (IS_ERR(inode) || is_bad_inode(inode)) {
+		if (!IS_ERR(inode))
 			iput(inode);
 		return -ENOENT;
 	}

From 3d6b5c3b5c0b970ce8a9d3bac6854f5c0ce0295a Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 6 Sep 2012 00:18:10 -0600
Subject: [PATCH 055/121] Btrfs: check return value of ulist_alloc() properly

ulist_alloc() has the possibility of returning NULL.
So, it is necessary to check the return value.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/qgroup.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9b707ba5c6c4..5039686df6ae 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1469,6 +1469,10 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 	 * be exceeded
 	 */
 	ulist = ulist_alloc(GFP_ATOMIC);
+	if (!ulist) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {
@@ -1541,6 +1545,10 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
 		goto out;
 
 	ulist = ulist_alloc(GFP_ATOMIC);
+	if (!ulist) {
+		btrfs_std_error(fs_info, -ENOMEM);
+		goto out;
+	}
 	ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(ulist, &uiter))) {

From 9e8a4a8b0b9484e8d14674fc62c9ad8ac9dbce5b Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 5 Sep 2012 19:10:51 -0600
Subject: [PATCH 056/121] Btrfs: use flag EXTENT_DEFRAG for snapshot-aware
 defrag

We're going to use this flag EXTENT_DEFRAG to indicate which range
belongs to defragment so that we can implement snapshow-aware defrag:

We set the EXTENT_DEFRAG flag when dirtying the extents that need
defragmented, so later on writeback thread can differentiate between
normal writeback and writeback started by defragmentation.

Original-Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/extent_io.c |  8 ++++++++
 fs/btrfs/extent_io.h |  2 ++
 fs/btrfs/file.c      |  4 ++--
 fs/btrfs/inode.c     | 20 ++++++++++++--------
 fs/btrfs/ioctl.c     |  8 ++++----
 5 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 19319f5a91a8..4a41b17295dc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1144,6 +1144,14 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			      NULL, cached_state, mask);
 }
 
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+		      struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+			      NULL, cached_state, mask);
+}
+
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 25900af5b15d..512f8da041f1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -235,6 +235,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		       int bits, int clear_bits, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+		      struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, int bits);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 00279c9a7f35..0a4b03d8fcd6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1203,8 +1203,8 @@ again:
 
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
-				  GFP_NOFS);
+				  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+				  0, 0, &cached_state, GFP_NOFS);
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 				     start_pos, last_pos - 1, &cached_state,
 				     GFP_NOFS);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d34eb329720d..9b21cf97cdd5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3549,7 +3549,8 @@ again:
 	}
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6061,7 +6062,8 @@ unlock:
 	if (lockstart < lockend) {
 		if (create && len < lockend - lockstart) {
 			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockstart + len - 1, unlock_bits, 1, 0,
+					 lockstart + len - 1,
+					 unlock_bits | EXTENT_DEFRAG, 1, 0,
 					 &cached_state, GFP_NOFS);
 			/*
 			 * Beside unlock, we also need to cleanup reserved space
@@ -6069,8 +6071,8 @@ unlock:
 			 */
 			clear_extent_bit(&BTRFS_I(inode)->io_tree,
 					 lockstart + len, lockend,
-					 unlock_bits | EXTENT_DO_ACCOUNTING,
-					 1, 0, NULL, GFP_NOFS);
+					 unlock_bits | EXTENT_DO_ACCOUNTING |
+					 EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
 		} else {
 			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
 					 lockend, unlock_bits, 1, 0,
@@ -6635,8 +6637,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		 */
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-				 &cached_state, GFP_NOFS);
+				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -6652,7 +6654,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	}
 	clear_extent_bit(tree, page_start, page_end,
 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
+		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+		 &cached_state, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -6749,7 +6752,8 @@ again:
 	 * prepare_pages in the normal write path.
 	 */
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c56daa368768..de886ac7ba0f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1023,8 +1023,8 @@ again:
 			 page_start, page_end - 1, 0, &cached_state);
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
 			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-			  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
-			  GFP_NOFS);
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+			  &cached_state, GFP_NOFS);
 
 	if (i_done != page_cnt) {
 		spin_lock(&BTRFS_I(inode)->lock);
@@ -1035,8 +1035,8 @@ again:
 	}
 
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
-				  &cached_state);
+	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+			  &cached_state, GFP_NOFS);
 
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 			     page_start, page_end - 1, &cached_state,

From dea31f52337c18f19eadfbbccb0c477942dad495 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 6 Sep 2012 16:47:00 -0400
Subject: [PATCH 057/121] Btrfs: wait on async pages when shrinking delalloc

Mitch reported a problem where you could get an ENOSPC error when untarring
a kernel git tree onto a 16gb file system with compress-force=zlib.  This is
because compression is a huge pain, it will return from ->writepages()
without having actually created any ordered extents.  To get around this we
check to see if the async submit counter is up, and if it is wait until it
drops to 0 before doing our normal ordered wait dance.  With this patch I
can now untar a kernel git tree onto a 16gb file system without getting
ENOSPC errors.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent-tree.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b6b33e463ac3..06369d588f27 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3697,6 +3697,13 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
 					       WB_REASON_FS_FREE_SPACE);
 
+		/*
+		 * We need to wait for the async pages to actually start before
+		 * we do anything.
+		 */
+		wait_event(root->fs_info->async_submit_wait,
+			   !atomic_read(&root->fs_info->async_delalloc_pages));
+
 		spin_lock(&space_info->lock);
 		if (space_info->bytes_used + space_info->bytes_reserved +
 		    space_info->bytes_pinned + space_info->bytes_readonly +

From a80c8dcf7e5065adc555ef8ffb256df11e3293e3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 6 Sep 2012 16:59:33 -0400
Subject: [PATCH 058/121] Btrfs: fix our overcommit math

I noticed I was seeing large lags when running my torrent test in a vm on my
laptop.  While trying to make it lag less I noticed that our overcommit math
was taking into account the number of bytes we wanted to reclaim, not the
number of bytes we actually wanted to allocate, which means we wouldn't
overcommit as often.  This patch fixes the overcommit math and makes
shrink_delalloc() use that logic so that it will stop looping faster.  We
still have pretty high spikes of latency, but the test now takes 3 minutes
less time (about 5% faster).  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 06369d588f27..a8de1c30f21c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3663,6 +3663,46 @@ out:
 	return ret;
 }
 
+static int can_overcommit(struct btrfs_root *root,
+			  struct btrfs_space_info *space_info, u64 bytes,
+			  int flush)
+{
+	u64 profile = btrfs_get_alloc_profile(root, 0);
+	u64 avail;
+	u64 used;
+
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly +
+		space_info->bytes_may_use;
+
+	spin_lock(&root->fs_info->free_chunk_lock);
+	avail = root->fs_info->free_chunk_space;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
+	/*
+	 * If we have dup, raid1 or raid10 then only half of the free
+	 * space is actually useable.
+	 */
+	if (profile & (BTRFS_BLOCK_GROUP_DUP |
+		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID10))
+		avail >>= 1;
+
+	/*
+	 * If we aren't flushing don't let us overcommit too much, say
+	 * 1/8th of the space.  If we can flush, let it overcommit up to
+	 * 1/2 of the space.
+	 */
+	if (flush)
+		avail >>= 3;
+	else
+		avail >>= 1;
+
+	if (used + bytes < space_info->total_bytes + avail)
+		return 1;
+	return 0;
+}
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -3705,10 +3745,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 			   !atomic_read(&root->fs_info->async_delalloc_pages));
 
 		spin_lock(&space_info->lock);
-		if (space_info->bytes_used + space_info->bytes_reserved +
-		    space_info->bytes_pinned + space_info->bytes_readonly +
-		    space_info->bytes_may_use + orig <=
-		    space_info->total_bytes) {
+		if (can_overcommit(root, space_info, orig, !trans)) {
 			spin_unlock(&space_info->lock);
 			break;
 		}
@@ -3924,7 +3961,6 @@ again:
 	}
 
 	if (ret) {
-		u64 profile = btrfs_get_alloc_profile(root, 0);
 		u64 avail;
 
 		/*
@@ -3945,30 +3981,7 @@ again:
 			goto again;
 		}
 
-		spin_lock(&root->fs_info->free_chunk_lock);
-		avail = root->fs_info->free_chunk_space;
-
-		/*
-		 * If we have dup, raid1 or raid10 then only half of the free
-		 * space is actually useable.
-		 */
-		if (profile & (BTRFS_BLOCK_GROUP_DUP |
-			       BTRFS_BLOCK_GROUP_RAID1 |
-			       BTRFS_BLOCK_GROUP_RAID10))
-			avail >>= 1;
-
-		/*
-		 * If we aren't flushing don't let us overcommit too much, say
-		 * 1/8th of the space.  If we can flush, let it overcommit up to
-		 * 1/2 of the space.
-		 */
-		if (flush)
-			avail >>= 3;
-		else
-			avail >>= 1;
-		 spin_unlock(&root->fs_info->free_chunk_lock);
-
-		if (used + num_bytes < space_info->total_bytes + avail) {
+		if (can_overcommit(root, space_info, orig_bytes, flush)) {
 			space_info->bytes_may_use += orig_bytes;
 			trace_btrfs_space_reservation(root->fs_info,
 				"space_info", space_info->flags, orig_bytes, 1);

From 837e197283199de640857192ca32767cb6e24fe8 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 7 Sep 2012 03:00:48 -0600
Subject: [PATCH 059/121] btrfs: polish names of kmem caches

Usecase:

  watch 'grep btrfs < /proc/slabinfo'

easy to watch all caches in one go.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/delayed-inode.c |  2 +-
 fs/btrfs/extent_io.c     |  4 ++--
 fs/btrfs/extent_map.c    |  2 +-
 fs/btrfs/inode.c         | 10 +++++-----
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index eb768c4c1358..b26d2f9d88d7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -29,7 +29,7 @@ static struct kmem_cache *delayed_node_cache;
 
 int __init btrfs_delayed_inode_init(void)
 {
-	delayed_node_cache = kmem_cache_create("delayed_node",
+	delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
 					sizeof(struct btrfs_delayed_node),
 					0,
 					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4a41b17295dc..3ad84f500687 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,13 +64,13 @@ tree_fs_info(struct extent_io_tree *tree)
 
 int __init extent_io_init(void)
 {
-	extent_state_cache = kmem_cache_create("extent_state",
+	extent_state_cache = kmem_cache_create("btrfs_extent_state",
 			sizeof(struct extent_state), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_state_cache)
 		return -ENOMEM;
 
-	extent_buffer_cache = kmem_cache_create("extent_buffers",
+	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 			sizeof(struct extent_buffer), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_buffer_cache)
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ac606f076eb7..8d1364d385d6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,7 +11,7 @@ static struct kmem_cache *extent_map_cache;
 
 int __init extent_map_init(void)
 {
-	extent_map_cache = kmem_cache_create("extent_map",
+	extent_map_cache = kmem_cache_create("btrfs_extent_map",
 			sizeof(struct extent_map), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_map_cache)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9b21cf97cdd5..aae55625aa59 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7155,31 +7155,31 @@ void btrfs_destroy_cachep(void)
 
 int btrfs_init_cachep(void)
 {
-	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 			sizeof(struct btrfs_inode), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
 
-	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
 			sizeof(struct btrfs_trans_handle), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_trans_handle_cachep)
 		goto fail;
 
-	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
 			sizeof(struct btrfs_transaction), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_transaction_cachep)
 		goto fail;
 
-	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+	btrfs_path_cachep = kmem_cache_create("btrfs_path",
 			sizeof(struct btrfs_path), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_path_cachep)
 		goto fail;
 
-	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
+	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
 			sizeof(struct btrfs_free_space), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_free_space_cachep)

From 8407aa464331556e4f6784f974030b83fc7585ed Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 7 Sep 2012 01:43:32 -0600
Subject: [PATCH 060/121] Btrfs: fix corrupted metadata in the snapshot

When we delete a inode, we will remove all the delayed items including delayed
inode update, and then truncate all the relative metadata. If there is lots of
metadata, we will end the current transaction, and start a new transaction to
truncate the left metadata. In this way, we will leave a inode item that its
link counter is > 0, and also may leave some directory index items in fs/file tree
after the current transaction ends. In other words, the metadata in this fs/file tree
is inconsistent. If we create a snapshot for this tree now, we will find a inode with
corrupted metadata in the new snapshot, and we won't continue to drop the left metadata,
because its link counter is not 0.

We fix this problem by updating the inode item before the current transaction ends.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/inode.c       | 19 +++++++++----------
 fs/btrfs/transaction.c | 29 +++++++++++++++++++++--------
 fs/btrfs/transaction.h |  2 ++
 3 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index aae55625aa59..9e9754adf7a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3834,15 +3834,10 @@ void btrfs_evict_inode(struct inode *inode)
 	btrfs_i_size_write(inode, 0);
 
 	/*
-	 * This is a bit simpler than btrfs_truncate since
-	 *
-	 * 1) We've already reserved our space for our orphan item in the
-	 *    unlink.
-	 * 2) We're going to delete the inode item, so we don't need to update
-	 *    it at all.
-	 *
-	 * So we just need to reserve some slack space in case we add bytes when
-	 * doing the truncate.
+	 * This is a bit simpler than btrfs_truncate since we've already
+	 * reserved our space for our orphan item in the unlink, so we just
+	 * need to reserve some slack space in case we add bytes and update
+	 * inode item when doing the truncate.
 	 */
 	while (1) {
 		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
@@ -3863,7 +3858,7 @@ void btrfs_evict_inode(struct inode *inode)
 			goto no_delete;
 		}
 
-		trans = btrfs_start_transaction(root, 0);
+		trans = btrfs_start_transaction_noflush(root, 1);
 		if (IS_ERR(trans)) {
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
@@ -3876,6 +3871,10 @@ void btrfs_evict_inode(struct inode *inode)
 		if (ret != -ENOSPC)
 			break;
 
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		ret = btrfs_update_inode(trans, root, inode);
+		BUG_ON(ret);
+
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a86fc723aad9..f8ae448ebec4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -290,7 +290,8 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 }
 
 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-						    u64 num_items, int type)
+						    u64 num_items, int type,
+						    int noflush)
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
@@ -324,9 +325,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 		}
 
 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
-		ret = btrfs_block_rsv_add(root,
-					  &root->fs_info->trans_block_rsv,
-					  num_bytes);
+		if (noflush)
+			ret = btrfs_block_rsv_add_noflush(root,
+						&root->fs_info->trans_block_rsv,
+						num_bytes);
+		else
+			ret = btrfs_block_rsv_add(root,
+						&root->fs_info->trans_block_rsv,
+						num_bytes);
 		if (ret)
 			return ERR_PTR(ret);
 	}
@@ -393,21 +399,28 @@ got_it:
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items)
 {
-	return start_transaction(root, num_items, TRANS_START);
+	return start_transaction(root, num_items, TRANS_START, 0);
 }
+
+struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+					struct btrfs_root *root, int num_items)
+{
+	return start_transaction(root, num_items, TRANS_START, 1);
+}
+
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN);
+	return start_transaction(root, 0, TRANS_JOIN, 0);
 }
 
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+	return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
 }
 
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_USERSPACE);
+	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
 /* wait for a transaction commit to be fully complete */
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 1a138bfb8f13..1b054800eca3 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -97,6 +97,8 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+					struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);

From 0647d6bd16c03c928a22d1b90c4c8684c5908f08 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Fri, 7 Sep 2012 20:01:26 -0600
Subject: [PATCH 061/121] Btrfs: cleanup for unused ref cache stuff

As ref cache has been removed from btrfs, there is no user on
its lock and its check.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/ctree.h   | 3 ---
 fs/btrfs/disk-io.c | 5 -----
 2 files changed, 8 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6923b9e4f90d..305002b7bf3a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1376,9 +1376,6 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
-	spinlock_t ref_cache_lock;
-	u64 total_ref_cache_size;
-
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 34717ac2416a..8054f7ccf46a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2000,7 +2000,6 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->trans_lock);
-	spin_lock_init(&fs_info->ref_cache_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
@@ -3214,10 +3213,6 @@ int close_ctree(struct btrfs_root *root)
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       (unsigned long long)fs_info->delalloc_bytes);
 	}
-	if (fs_info->total_ref_cache_size) {
-		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
-		       (unsigned long long)fs_info->total_ref_cache_size);
-	}
 
 	free_extent_buffer(fs_info->extent_root->node);
 	free_extent_buffer(fs_info->extent_root->commit_root);

From dea7d76ecbfb53cda6aadd9bed33e87d255c5b02 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 7 Sep 2012 20:01:27 -0600
Subject: [PATCH 062/121] Btrfs: update delayed ref's tracepoints to show
 sequence

We've added a new field 'sequence' to delayed ref node, so update related
tracepoints.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 include/trace/events/btrfs.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 91b91e805673..54fab041b22a 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -445,6 +445,7 @@ TRACE_EVENT(btrfs_delayed_tree_ref,
 		__field(	u64,  ref_root		)
 		__field(	int,  level		)
 		__field(	int,  type		)
+		__field(	u64,  seq		)
 	),
 
 	TP_fast_assign(
@@ -455,17 +456,19 @@ TRACE_EVENT(btrfs_delayed_tree_ref,
 		__entry->ref_root	= full_ref->root;
 		__entry->level		= full_ref->level;
 		__entry->type		= ref->type;
+		__entry->seq		= ref->seq;
 	),
 
 	TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
 		  "parent = %llu(%s), ref_root = %llu(%s), level = %d, "
-		  "type = %s",
+		  "type = %s, seq = %llu",
 		  (unsigned long long)__entry->bytenr,
 		  (unsigned long long)__entry->num_bytes,
 		  show_ref_action(__entry->action),
 		  show_root_type(__entry->parent),
 		  show_root_type(__entry->ref_root),
-		  __entry->level, show_ref_type(__entry->type))
+		  __entry->level, show_ref_type(__entry->type),
+		  (unsigned long long)__entry->seq)
 );
 
 TRACE_EVENT(btrfs_delayed_data_ref,
@@ -485,6 +488,7 @@ TRACE_EVENT(btrfs_delayed_data_ref,
 		__field(	u64,  owner		)
 		__field(	u64,  offset		)
 		__field(	int,  type		)
+		__field(	u64,  seq		)
 	),
 
 	TP_fast_assign(
@@ -496,11 +500,12 @@ TRACE_EVENT(btrfs_delayed_data_ref,
 		__entry->owner		= full_ref->objectid;
 		__entry->offset		= full_ref->offset;
 		__entry->type		= ref->type;
+		__entry->seq		= ref->seq;
 	),
 
 	TP_printk("bytenr = %llu, num_bytes = %llu, action = %s, "
 		  "parent = %llu(%s), ref_root = %llu(%s), owner = %llu, "
-		  "offset = %llu, type = %s",
+		  "offset = %llu, type = %s, seq = %llu",
 		  (unsigned long long)__entry->bytenr,
 		  (unsigned long long)__entry->num_bytes,
 		  show_ref_action(__entry->action),
@@ -508,7 +513,8 @@ TRACE_EVENT(btrfs_delayed_data_ref,
 		  show_root_type(__entry->ref_root),
 		  (unsigned long long)__entry->owner,
 		  (unsigned long long)__entry->offset,
-		  show_ref_type(__entry->type))
+		  show_ref_type(__entry->type),
+		  (unsigned long long)__entry->seq)
 );
 
 TRACE_EVENT(btrfs_delayed_ref_head,

From 69917e431210f8712fe050f47b7561e7dae89521 Mon Sep 17 00:00:00 2001
From: Liu Bo <liub.liubo@gmail.com>
Date: Fri, 7 Sep 2012 20:01:28 -0600
Subject: [PATCH 063/121] Btrfs: fix a bug in parsing return value in logical
 resolve

In logical resolve, we parse extent_from_logical()'s 'ret' as a kind of flag.

It is possible to lose our errors because
(-EXXXX & BTRFS_EXTENT_FLAG_TREE_BLOCK) is true.

I'm not sure if it is on purpose, it just looks too hacky if it is.
I'd rather use a real flag and a 'ret' to catch errors.

Acked-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Liu Bo <liub.liubo@gmail.com>
---
 fs/btrfs/backref.c | 24 ++++++++++++++++--------
 fs/btrfs/backref.h |  3 ++-
 fs/btrfs/ioctl.c   |  6 ++++--
 fs/btrfs/scrub.c   | 16 +++++++++-------
 fs/btrfs/send.c    |  7 ++++---
 5 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e600857d3ca4..ab286e5bfe33 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1193,7 +1193,8 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
  * tree blocks and <0 on error.
  */
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key)
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags_ret)
 {
 	int ret;
 	u64 flags;
@@ -1237,10 +1238,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 		 (unsigned long long)found_key->objectid,
 		 (unsigned long long)found_key->offset,
 		 (unsigned long long)flags, item_size);
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
-	if (flags & BTRFS_EXTENT_FLAG_DATA)
-		return BTRFS_EXTENT_FLAG_DATA;
+
+	WARN_ON(!flags_ret);
+	if (flags_ret) {
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+		else if (flags & BTRFS_EXTENT_FLAG_DATA)
+			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
+		else
+			BUG_ON(1);
+		return 0;
+	}
 
 	return -EIO;
 }
@@ -1433,15 +1441,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 {
 	int ret;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_key found_key;
 	int search_commit_root = path->search_commit_root;
 
-	ret = extent_from_logical(fs_info, logical, path,
-					&found_key);
+	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
 	btrfs_release_path(path);
 	if (ret < 0)
 		return ret;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return -EINVAL;
 
 	extent_item_pos = logical - found_key.objectid;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 032f4dc7eab8..4fda5d8029a7 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,7 +40,8 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 			struct btrfs_path *path);
 
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key);
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 				struct btrfs_extent_item *ei, u32 item_size,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index de886ac7ba0f..48ff3f7317b0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3211,6 +3211,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 	int ret = 0;
 	int size;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_ioctl_logical_ino_args *loi;
 	struct btrfs_data_container *inodes = NULL;
 	struct btrfs_path *path = NULL;
@@ -3240,10 +3241,11 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 		goto out;
 	}
 
-	ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+	ret = extent_from_logical(root->fs_info, loi->logical, path, &key,
+				  &flags);
 	btrfs_release_path(path);
 
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		ret = -ENOENT;
 	if (ret < 0)
 		goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4e9eafe01c55..d3bb901ade42 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -352,13 +352,14 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct scrub_warning swarn;
-	u32 item_size;
-	int ret;
-	u64 ref_root;
-	u8 ref_level;
 	unsigned long ptr = 0;
-	const int bufsize = 4096;
 	u64 extent_item_pos;
+	u64 flags = 0;
+	u64 ref_root;
+	u32 item_size;
+	u8 ref_level;
+	const int bufsize = 4096;
+	int ret;
 
 	path = btrfs_alloc_path();
 
@@ -375,7 +376,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
 		goto out;
 
-	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+				  &flags);
 	if (ret < 0)
 		goto out;
 
@@ -387,7 +389,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
 	btrfs_release_path(path);
 
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		do {
 			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
 							&ref_root, &ref_level);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e5c867996aa0..c6ef070a8dca 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1152,6 +1152,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	u64 disk_byte;
 	u64 num_bytes;
 	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *eb = path->nodes[0];
 	struct backref_ctx *backref_ctx = NULL;
@@ -1198,13 +1199,13 @@ static int find_extent_clone(struct send_ctx *sctx,
 	}
 	logical = disk_byte + btrfs_file_extent_offset(eb, fi);
 
-	ret = extent_from_logical(sctx->send_root->fs_info,
-			disk_byte, tmp_path, &found_key);
+	ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
+				  &found_key, &flags);
 	btrfs_release_path(tmp_path);
 
 	if (ret < 0)
 		goto out;
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		ret = -EIO;
 		goto out;
 	}

From df031f0752d50f2061df2847d57ea52a79f7977c Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 7 Sep 2012 20:01:29 -0600
Subject: [PATCH 064/121] Btrfs: use helper for logical resolve

We already have a helper, iterate_inodes_from_logical(), for logical resolve,
so just use it.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/ioctl.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 48ff3f7317b0..fa1284d596ea 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3210,12 +3210,9 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 {
 	int ret = 0;
 	int size;
-	u64 extent_item_pos;
-	u64 flags = 0;
 	struct btrfs_ioctl_logical_ino_args *loi;
 	struct btrfs_data_container *inodes = NULL;
 	struct btrfs_path *path = NULL;
-	struct btrfs_key key;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -3241,23 +3238,13 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 		goto out;
 	}
 
-	ret = extent_from_logical(root->fs_info, loi->logical, path, &key,
-				  &flags);
-	btrfs_release_path(path);
-
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+					  build_ino_list, inodes);
+	if (ret == -EINVAL)
 		ret = -ENOENT;
 	if (ret < 0)
 		goto out;
 
-	extent_item_pos = loi->logical - key.objectid;
-	ret = iterate_extent_inodes(root->fs_info, key.objectid,
-					extent_item_pos, 0, build_ino_list,
-					inodes);
-
-	if (ret < 0)
-		goto out;
-
 	ret = copy_to_user((void *)(unsigned long)loi->inodes,
 			   (void *)(unsigned long)inodes, size);
 	if (ret)

From 425d17a290c0c63785ec65db154a95c6337aeefa Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 7 Sep 2012 20:01:30 -0600
Subject: [PATCH 065/121] Btrfs: use larger limit for translation of logical to
 inode

This is the change of the kernel side.

Translation of logical to inode used to have an upper limit 4k on
inode container's size, but the limit is not large enough for a data
with a great many of refs, so when resolving logical address,
we can end up with
"ioctl ret=0, bytes_left=0, bytes_missing=19944, cnt=510, missed=2493"

This changes to regard 64k as the upper limit and use vmalloc instead of
kmalloc to get memory more easily.

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/backref.c | 5 +++--
 fs/btrfs/ioctl.c   | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ab286e5bfe33..7084431b7c9c 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
@@ -1584,7 +1585,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
 	size_t alloc_bytes;
 
 	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
-	data = kmalloc(alloc_bytes, GFP_NOFS);
+	data = vmalloc(alloc_bytes);
 	if (!data)
 		return ERR_PTR(-ENOMEM);
 
@@ -1635,6 +1636,6 @@ void free_ipath(struct inode_fs_paths *ipath)
 {
 	if (!ipath)
 		return;
-	kfree(ipath->fspath);
+	vfree(ipath->fspath);
 	kfree(ipath);
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fa1284d596ea..38e3d6ab89d3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3230,7 +3230,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 		goto out;
 	}
 
-	size = min_t(u32, loi->size, 4096);
+	size = min_t(u32, loi->size, 64 * 1024);
 	inodes = init_data_container(size);
 	if (IS_ERR(inodes)) {
 		ret = PTR_ERR(inodes);
@@ -3252,7 +3252,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 
 out:
 	btrfs_free_path(path);
-	kfree(inodes);
+	vfree(inodes);
 	kfree(loi);
 
 	return ret;

From 6df7881a84013f91405e5e113a4c322dd1804ba6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 5 Sep 2012 08:08:30 -0600
Subject: [PATCH 066/121] Btrfs: move the sb_end_intwrite until after the
 throttle logic

Sage reported the following lockdep backtrace

=====================================
[ BUG: bad unlock balance detected! ]
3.6.0-rc2-ceph-00171-gc7ed62d #1 Not tainted
-------------------------------------
btrfs-cleaner/7607 is trying to release lock (sb_internal) at:
[<ffffffffa00422ae>] btrfs_commit_transaction+0xa6e/0xb20 [btrfs]
but there are no more locks to release!

other info that might help us debug this:
1 lock held by btrfs-cleaner/7607:
 #0:  (&fs_info->cleaner_mutex){+.+...}, at: [<ffffffffa003b405>] cleaner_kthread+0x95/0x120 [btrfs]

stack backtrace:
Pid: 7607, comm: btrfs-cleaner Not tainted 3.6.0-rc2-ceph-00171-gc7ed62d #1
Call Trace:
 [<ffffffffa00422ae>] ? btrfs_commit_transaction+0xa6e/0xb20 [btrfs]
 [<ffffffff810afa9e>] print_unlock_inbalance_bug+0xfe/0x110
 [<ffffffff810b289e>] lock_release_non_nested+0x1ee/0x310
 [<ffffffff81172f9b>] ? kmem_cache_free+0x7b/0x160
 [<ffffffffa004106c>] ? put_transaction+0x8c/0x130 [btrfs]
 [<ffffffffa00422ae>] ? btrfs_commit_transaction+0xa6e/0xb20 [btrfs]
 [<ffffffff810b2a95>] lock_release+0xd5/0x220
 [<ffffffff81173071>] ? kmem_cache_free+0x151/0x160
 [<ffffffff8117d9ed>] __sb_end_write+0x7d/0x90
 [<ffffffffa00422ae>] btrfs_commit_transaction+0xa6e/0xb20 [btrfs]
 [<ffffffff81079850>] ? __init_waitqueue_head+0x60/0x60
 [<ffffffff81634c6b>] ? _raw_spin_unlock+0x2b/0x40
 [<ffffffffa0042758>] __btrfs_end_transaction+0x368/0x3c0 [btrfs]
 [<ffffffffa0042808>] btrfs_end_transaction_throttle+0x18/0x20 [btrfs]
 [<ffffffffa00318f0>] btrfs_drop_snapshot+0x410/0x600 [btrfs]
 [<ffffffff8132babd>] ? do_raw_spin_unlock+0x5d/0xb0
 [<ffffffffa00430ef>] btrfs_clean_old_snapshots+0xaf/0x150 [btrfs]
 [<ffffffffa003b405>] ? cleaner_kthread+0x95/0x120 [btrfs]
 [<ffffffffa003b419>] cleaner_kthread+0xa9/0x120 [btrfs]
 [<ffffffffa003b370>] ? btrfs_destroy_delayed_refs.isra.102+0x220/0x220 [btrfs]
 [<ffffffff810791ee>] kthread+0xae/0xc0
 [<ffffffff810b379d>] ? trace_hardirqs_on+0xd/0x10
 [<ffffffff8163e744>] kernel_thread_helper+0x4/0x10
 [<ffffffff81635430>] ? retint_restore_args+0x13/0x13
 [<ffffffff81079140>] ? flush_kthread_work+0x1a0/0x1a0
 [<ffffffff8163e740>] ? gs_change+0x13/0x13

This is because the throttle stuff can commit the transaction, which expects to
be the one stopping the intwrite stuff, but we've already done it in the
__btrfs_end_transaction.  Moving the sb_end_intewrite after this logic makes the
lockdep go away.  Thanks,

Tested-by: Sage Weil <sage@inktank.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/transaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f8ae448ebec4..0629edf99100 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -564,8 +564,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 
-	sb_end_intwrite(root->fs_info->sb);
-
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
 	    should_end_transaction(trans, root)) {
 		trans->transaction->blocked = 1;
@@ -586,6 +584,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	sb_end_intwrite(root->fs_info->sb);
+
 	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
 	atomic_dec(&cur_trans->num_writers);

From 69ffb54347a71c4f2a4694e0c1155af8538d55f6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 11 Sep 2012 15:40:07 -0400
Subject: [PATCH 067/121] Btrfs: create a pinned em when writing to a prealloc
 range in DIO

Wade Cline reported a problem where he was getting garbage and warnings when
writing to a preallocated range via O_DIRECT.  This is because we weren't
creating our normal pinned extent_map for the range we were writing to,
which was causing all sorts of issues.  This patch fixes the problem and
makes his testcase much happier.  Thanks,

Reported-by: Wade Cline <clinew@linux.vnet.ibm.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9e9754adf7a4..406666cb6156 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5898,6 +5898,48 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 	return ret;
 }
 
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   int type)
+{
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	em = alloc_extent_map();
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	em->start = start;
+	em->orig_start = orig_start;
+	em->len = len;
+	em->block_len = block_len;
+	em->block_start = block_start;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	if (type == BTRFS_ORDERED_PREALLOC)
+		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+
+	do {
+		btrfs_drop_extent_cache(inode, em->start,
+				em->start + em->len - 1, 0);
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		write_unlock(&em_tree->lock);
+	} while (ret == -EEXIST);
+
+	if (ret) {
+		free_extent_map(em);
+		return ERR_PTR(ret);
+	}
+
+	return em;
+}
+
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
@@ -6012,6 +6054,19 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 			goto must_cow;
 
 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
+			u64 orig_start = em->start;
+
+			if (type == BTRFS_ORDERED_PREALLOC) {
+				free_extent_map(em);
+				em = create_pinned_em(inode, start, len,
+						       orig_start,
+						       block_start, len, type);
+				if (IS_ERR(em)) {
+					btrfs_end_transaction(trans, root);
+					goto unlock_err;
+				}
+			}
+
 			ret = btrfs_add_ordered_extent_dio(inode, start,
 					   block_start, len, len, type);
 			btrfs_end_transaction(trans, root);

From 962197babeccc1f4cc8aa28ad844df80bdc85ed0 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 11 Sep 2012 23:27:35 -0600
Subject: [PATCH 068/121] Btrfs: fix unnecessary warning when the fragments
 make the space alloc fail

When we wrote some data by compress mode into a btrfs filesystem which was full
of the fragments, the kernel will report:
	BTRFS warning (device xxx): Aborting unused transaction.

The reason is:
We can not find a long enough free space to store the compressed data because
of the fragmentary free space, and the compressed data can not be splited,
so the kernel outputed the above message.

In fact, btrfs can deal with this problem very well: it fall back to
uncompressed IO, split the uncompressed data into small ones, and then
store them into to the fragmentary free space. So we shouldn't output the
above warning message.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 406666cb6156..d41e467b4aa2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -662,7 +662,7 @@ retry:
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
 					   0, alloc_hint, &ins, 1);
-			if (ret)
+			if (ret && ret != -ENOSPC)
 				btrfs_abort_transaction(trans, root, ret);
 			btrfs_end_transaction(trans, root);
 		}

From be3940c0a90265654d778394cafe2e2cec674df8 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <koverstreet@google.com>
Date: Tue, 11 Sep 2012 14:23:05 -0600
Subject: [PATCH 069/121] btrfs: Kill some bi_idx references

For immutable bio vecs, I've been auditing and removing bi_idx
references. These were harmless, but removing them will make auditing
easier.

scrub_bio_end_io_worker() was open coding a bio_reset() - but this
doesn't appear to have been needed for anything as right after it does a
bio_put(), and perusing the code it doesn't appear anything else was
holding a reference to the bio.

The other use end_bio_extent_readpage() was just for a pr_debug() -
changed it to something that might be a bit more useful.

Signed-off-by: Kent Overstreet <koverstreet@google.com>
CC: Chris Mason <chris.mason@oracle.com>
CC: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/extent_io.c |  4 ++--
 fs/btrfs/scrub.c     | 15 ---------------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3ad84f500687..90bd9f768c0a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2306,8 +2306,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
 
-		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
-			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+			 "mirror=%ld\n", (u64)bio->bi_sector, err,
 			 (long int)bio->bi_bdev);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index d3bb901ade42..27892f67e69b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1669,21 +1669,6 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 		scrub_block_put(sblock);
 	}
 
-	if (sbio->err) {
-		/* what is this good for??? */
-		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-		sbio->bio->bi_phys_segments = 0;
-		sbio->bio->bi_idx = 0;
-
-		for (i = 0; i < sbio->page_count; i++) {
-			struct bio_vec *bi;
-			bi = &sbio->bio->bi_io_vec[i];
-			bi->bv_offset = 0;
-			bi->bv_len = PAGE_SIZE;
-		}
-	}
-
 	bio_put(sbio->bio);
 	sbio->bio = NULL;
 	spin_lock(&sdev->list_lock);

From ea658badc47e614e38ab4d98510488474c7e6d4b Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 11 Sep 2012 16:57:25 -0400
Subject: [PATCH 070/121] Btrfs: delay block group item insertion

So we have lots of places where we try to preallocate chunks in order to
make sure we have enough space as we make our allocations.  This has
historically meant that we're constantly tweaking when we should allocate a
new chunk, and historically we have gotten this horribly wrong so we way
over allocate either metadata or data.  To try and keep this from happening
we are going to make it so that the block group item insertion is done out
of band at the end of a transaction.  This will allow us to create chunks
even if we are trying to make an allocation for the extent tree.  With this
patch my enospc tests run faster (didn't expect this) and more efficiently
use the disk space (this is what I wanted).  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/ctree.h       |   5 ++
 fs/btrfs/extent-tree.c | 130 ++++++++++++++++++++---------------------
 fs/btrfs/transaction.c |  10 ++++
 fs/btrfs/transaction.h |   1 +
 4 files changed, 79 insertions(+), 67 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 305002b7bf3a..d66dc1c6a40d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1137,6 +1137,9 @@ struct btrfs_block_group_cache {
 	 * Today it will only have one thing on it, but that may change
 	 */
 	struct list_head cluster_list;
+
+	/* For delayed block group creation */
+	struct list_head new_bg_list;
 };
 
 /* delayed seq elem */
@@ -2865,6 +2868,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a8de1c30f21c..254fd3289f4c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2361,10 +2361,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		}
 
 next:
-		do_chunk_alloc(trans, fs_info->extent_root,
-			       2 * 1024 * 1024,
-			       btrfs_get_alloc_profile(root, 0),
-			       CHUNK_ALLOC_NO_FORCE);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
@@ -2478,10 +2474,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	if (root == root->fs_info->extent_root)
 		root = root->fs_info->tree_root;
 
-	do_chunk_alloc(trans, root->fs_info->extent_root,
-		       2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
-		       CHUNK_ALLOC_NO_FORCE);
-
 	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -2551,6 +2543,12 @@ again:
 	}
 
 	if (run_all) {
+		if (!list_empty(&trans->new_bgs)) {
+			spin_unlock(&delayed_refs->lock);
+			btrfs_create_pending_block_groups(trans, root);
+			spin_lock(&delayed_refs->lock);
+		}
+
 		node = rb_first(&delayed_refs->root);
 		if (!node)
 			goto out;
@@ -3826,7 +3824,8 @@ enum flush_state {
 	FLUSH_DELALLOC_WAIT	=	2,
 	FLUSH_DELAYED_ITEMS_NR	=	3,
 	FLUSH_DELAYED_ITEMS	=	4,
-	COMMIT_TRANS		=	5,
+	ALLOC_CHUNK		=	5,
+	COMMIT_TRANS		=	6,
 };
 
 static int flush_space(struct btrfs_root *root,
@@ -3863,6 +3862,20 @@ static int flush_space(struct btrfs_root *root,
 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
 		btrfs_end_transaction(trans, root);
 		break;
+	case ALLOC_CHUNK:
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+				     num_bytes,
+				     btrfs_get_alloc_profile(root, 0),
+				     CHUNK_ALLOC_NO_FORCE);
+		btrfs_end_transaction(trans, root);
+		if (ret == -ENOSPC)
+			ret = 0;
+		break;
 	case COMMIT_TRANS:
 		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
 		break;
@@ -5515,8 +5528,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *used_block_group;
 	u64 search_start = 0;
 	int empty_cluster = 2 * 1024 * 1024;
-	int allowed_chunk_alloc = 0;
-	int done_chunk_alloc = 0;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
 	int index = 0;
@@ -5548,9 +5559,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	if (btrfs_mixed_space_info(space_info))
 		use_cluster = false;
 
-	if (orig_root->ref_cows || empty_size)
-		allowed_chunk_alloc = 1;
-
 	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
 		last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
@@ -5860,34 +5868,18 @@ loop:
 		index = 0;
 		loop++;
 		if (loop == LOOP_ALLOC_CHUNK) {
-		       if (allowed_chunk_alloc) {
-				ret = do_chunk_alloc(trans, root, num_bytes +
-						     2 * 1024 * 1024, data,
-						     CHUNK_ALLOC_LIMITED);
-				/*
-				 * Do not bail out on ENOSPC since we
-				 * can do more things.
-				 */
-				if (ret < 0 && ret != -ENOSPC) {
-					btrfs_abort_transaction(trans,
-								root, ret);
-					goto out;
-				}
-				allowed_chunk_alloc = 0;
-				if (ret == 1)
-					done_chunk_alloc = 1;
-			} else if (!done_chunk_alloc &&
-				   space_info->force_alloc ==
-				   CHUNK_ALLOC_NO_FORCE) {
-				space_info->force_alloc = CHUNK_ALLOC_LIMITED;
+			ret = do_chunk_alloc(trans, root, num_bytes +
+					     2 * 1024 * 1024, data,
+					     CHUNK_ALLOC_FORCE);
+			/*
+			 * Do not bail out on ENOSPC since we
+			 * can do more things.
+			 */
+			if (ret < 0 && ret != -ENOSPC) {
+				btrfs_abort_transaction(trans,
+							root, ret);
+				goto out;
 			}
-
-		       /*
-			* We didn't allocate a chunk, go ahead and drop the
-			* empty size and loop again.
-			*/
-		       if (!done_chunk_alloc)
-			       loop = LOOP_NO_EMPTY_SIZE;
 		}
 
 		if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5962,20 +5954,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 
 	data = btrfs_get_alloc_profile(root, data);
 again:
-	/*
-	 * the only place that sets empty_size is btrfs_realloc_node, which
-	 * is not called recursively on allocations
-	 */
-	if (empty_size || root->ref_cows) {
-		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data,
-				     CHUNK_ALLOC_NO_FORCE);
-		if (ret < 0 && ret != -ENOSPC) {
-			btrfs_abort_transaction(trans, root, ret);
-			return ret;
-		}
-	}
-
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
 			       hint_byte, ins, data);
@@ -5985,12 +5963,6 @@ again:
 			num_bytes = num_bytes >> 1;
 			num_bytes = num_bytes & ~(root->sectorsize - 1);
 			num_bytes = max(num_bytes, min_alloc_size);
-			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				       num_bytes, data, CHUNK_ALLOC_FORCE);
-			if (ret < 0 && ret != -ENOSPC) {
-				btrfs_abort_transaction(trans, root, ret);
-				return ret;
-			}
 			if (num_bytes == min_alloc_size)
 				final_tried = true;
 			goto again;
@@ -7828,6 +7800,34 @@ error:
 	return ret;
 }
 
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *block_group, *tmp;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_block_group_item item;
+	struct btrfs_key key;
+	int ret = 0;
+
+	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+				 new_bg_list) {
+		list_del_init(&block_group->new_bg_list);
+
+		if (ret)
+			continue;
+
+		spin_lock(&block_group->lock);
+		memcpy(&item, &block_group->item, sizeof(item));
+		memcpy(&key, &block_group->key, sizeof(key));
+		spin_unlock(&block_group->lock);
+
+		ret = btrfs_insert_item(trans, extent_root, &key, &item,
+					sizeof(item));
+		if (ret)
+			btrfs_abort_transaction(trans, extent_root, ret);
+	}
+}
+
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7861,6 +7861,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	spin_lock_init(&cache->lock);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->new_bg_list);
 
 	btrfs_init_free_space_ctl(cache);
 
@@ -7892,12 +7893,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
 	BUG_ON(ret); /* Logic error */
 
-	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
-				sizeof(cache->item));
-	if (ret) {
-		btrfs_abort_transaction(trans, extent_root, ret);
-		return ret;
-	}
+	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
 
 	set_avail_alloc_bits(extent_root->fs_info, type);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0629edf99100..c01dec70c960 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -374,6 +374,7 @@ again:
 	h->qgroup_reserved = qgroup_reserved;
 	h->delayed_ref_elem.seq = 0;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
+	INIT_LIST_HEAD(&h->new_bgs);
 
 	smp_mb();
 	if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -549,6 +550,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		trans->qgroup_reserved = 0;
 	}
 
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
 	while (count < 2) {
 		unsigned long cur = trans->delayed_ref_updates;
 		trans->delayed_ref_updates = 0;
@@ -564,6 +568,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
 	    should_end_transaction(trans, root)) {
 		trans->transaction->blocked = 1;
@@ -1400,6 +1407,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	cur_trans->delayed_refs.flushing = 1;
 
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
 	ret = btrfs_run_delayed_refs(trans, root, 0);
 	if (ret)
 		goto cleanup_transaction;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 1b054800eca3..b6463e128048 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -68,6 +68,7 @@ struct btrfs_trans_handle {
 	struct btrfs_root *root;
 	struct seq_list delayed_ref_elem;
 	struct list_head qgroup_ref_list;
+	struct list_head new_bgs;
 };
 
 struct btrfs_pending_snapshot {

From 698d0082c4875a2ccc10b52ee8f415faad46b754 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 12 Sep 2012 14:08:47 -0400
Subject: [PATCH 071/121] Btrfs: remove bytes argument from do_chunk_alloc

Everybody is just making stuff up, and it's just used to see if we really do
need to alloc a chunk, and since we do this when we already know we really
do it's just a waste of space.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent-tree.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 254fd3289f4c..0bc83b3f33db 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -94,8 +94,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 				     u64 flags, struct btrfs_disk_key *key,
 				     int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *extent_root, u64 alloc_bytes,
-			  u64 flags, int force);
+			  struct btrfs_root *extent_root, u64 flags,
+			  int force);
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -3404,7 +3404,6 @@ alloc:
 				return PTR_ERR(trans);
 
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     bytes + 2 * 1024 * 1024,
 					     alloc_target,
 					     CHUNK_ALLOC_NO_FORCE);
 			btrfs_end_transaction(trans, root);
@@ -3486,8 +3485,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 }
 
 static int should_alloc_chunk(struct btrfs_root *root,
-			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
-			      int force)
+			      struct btrfs_space_info *sinfo, int force)
 {
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3518,7 +3516,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 			return 1;
 	}
 
-	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
+	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
 		return 0;
 	return 1;
 }
@@ -3568,8 +3566,7 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 }
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *extent_root, u64 alloc_bytes,
-			  u64 flags, int force)
+			  struct btrfs_root *extent_root, u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
@@ -3593,7 +3590,7 @@ again:
 		return 0;
 	}
 
-	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+	if (!should_alloc_chunk(extent_root, space_info, force)) {
 		spin_unlock(&space_info->lock);
 		return 0;
 	} else if (space_info->chunk_alloc) {
@@ -3869,7 +3866,6 @@ static int flush_space(struct btrfs_root *root,
 			break;
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes,
 				     btrfs_get_alloc_profile(root, 0),
 				     CHUNK_ALLOC_NO_FORCE);
 		btrfs_end_transaction(trans, root);
@@ -5868,8 +5864,7 @@ loop:
 		index = 0;
 		loop++;
 		if (loop == LOOP_ALLOC_CHUNK) {
-			ret = do_chunk_alloc(trans, root, num_bytes +
-					     2 * 1024 * 1024, data,
+			ret = do_chunk_alloc(trans, root, data,
 					     CHUNK_ALLOC_FORCE);
 			/*
 			 * Do not bail out on ENOSPC since we
@@ -7269,7 +7264,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 
 	alloc_flags = update_block_group_flags(root, cache->flags);
 	if (alloc_flags != cache->flags) {
-		ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+		ret = do_chunk_alloc(trans, root, alloc_flags,
 				     CHUNK_ALLOC_FORCE);
 		if (ret < 0)
 			goto out;
@@ -7279,7 +7274,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 	if (!ret)
 		goto out;
 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
-	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+	ret = do_chunk_alloc(trans, root, alloc_flags,
 			     CHUNK_ALLOC_FORCE);
 	if (ret < 0)
 		goto out;
@@ -7293,7 +7288,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 type)
 {
 	u64 alloc_flags = get_alloc_profile(root, type);
-	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+	return do_chunk_alloc(trans, root, alloc_flags,
 			      CHUNK_ALLOC_FORCE);
 }
 

From 90abccf2c6e6e9c5a5d519eaed95292afa30aa11 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 13 Sep 2012 04:53:47 -0600
Subject: [PATCH 072/121] Revert "Btrfs: do not do filemap_write_and_wait_range
 in fsync"

This reverts commit 0885ef5b5601e9b007c383e77c172769b1f214fd

After applying the above patch, the performance slowed down because the dirty
page flush can only be done by one task, so revert it.

The following is the test result of sysbench:
	Before		After
	24MB/s		39MB/s

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/file.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0a4b03d8fcd6..d0fc4c5aaf15 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1544,12 +1544,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_btrfs_sync_file(file, datasync);
 
+	/*
+	 * We write the dirty pages in the range and wait until they complete
+	 * out of the ->i_mutex. If so, we can flush the dirty pages by
+	 * multi-task, and make the performance up.
+	 */
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+
 	mutex_lock(&inode->i_mutex);
 
 	/*
-	 * we wait first, since the writeback may change the inode, also wait
-	 * ordered range does a filemape_write_and_wait_range which is why we
-	 * don't do it above like other file systems.
+	 * We flush the dirty pages again to avoid some dirty pages in the
+	 * range being left.
 	 */
 	atomic_inc(&root->log_batch);
 	btrfs_wait_ordered_range(inode, start, end);

From 0433f20d436fd040013e5a419a2e3d732d618a41 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 13 Sep 2012 03:32:32 -0600
Subject: [PATCH 073/121] Btrfs: cleanup of error processing in
 btree_get_extent()

This patch simplifies a little complex error processing in
btree_get_extent().

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/disk-io.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8054f7ccf46a..8db87bc53d27 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -222,21 +222,17 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (em) {
-			ret = 0;
-		} else {
-			em = lookup_extent_mapping(em_tree, failed_start,
-						   failed_len);
-			ret = -EIO;
+		if (!em) {
+			lookup_extent_mapping(em_tree, failed_start,
+					      failed_len);
+			em = ERR_PTR(-EIO);
 		}
 	} else if (ret) {
 		free_extent_map(em);
-		em = NULL;
+		em = ERR_PTR(ret);
 	}
 	write_unlock(&em_tree->lock);
 
-	if (ret)
-		em = ERR_PTR(ret);
 out:
 	return em;
 }

From b4f359ab065b9c6c132124557d23e90e2e8504be Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 13 Sep 2012 03:32:54 -0600
Subject: [PATCH 074/121] Btrfs: remove unnecessary code in btree_get_extent()

Unnecessary lookup_extent_mapping() is removed because an error is
returned to the caller.
This patch was made based on the advice from Stefan Behrens, thanks.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/disk-io.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8db87bc53d27..8d633e3e3919 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -217,16 +217,10 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
-		u64 failed_start = em->start;
-		u64 failed_len = em->len;
-
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (!em) {
-			lookup_extent_mapping(em_tree, failed_start,
-					      failed_len);
+		if (!em)
 			em = ERR_PTR(-EIO);
-		}
 	} else if (ret) {
 		free_extent_map(em);
 		em = ERR_PTR(ret);

From 1bcea35597693b3ac1ec1b311cfd42d52972a710 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Fri, 14 Sep 2012 00:04:21 -0600
Subject: [PATCH 075/121] Btrfs: write_buf is now callable outside send.c

Developing service cmds needs it.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/send.c | 11 ++++++-----
 fs/btrfs/send.h |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index c6ef070a8dca..c7beb543a4a8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -386,7 +386,7 @@ static struct btrfs_path *alloc_path_for_send(void)
 	return path;
 }
 
-static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
+int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
 {
 	int ret;
 	mm_segment_t old_fs;
@@ -396,8 +396,7 @@ static int write_buf(struct send_ctx *sctx, const void *buf, u32 len)
 	set_fs(KERNEL_DS);
 
 	while (pos < len) {
-		ret = vfs_write(sctx->send_filp, (char *)buf + pos, len - pos,
-				&sctx->send_off);
+		ret = vfs_write(filp, (char *)buf + pos, len - pos, off);
 		/* TODO handle that correctly */
 		/*if (ret == -ERESTARTSYS) {
 			continue;
@@ -553,7 +552,8 @@ static int send_header(struct send_ctx *sctx)
 	strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
 	hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
 
-	return write_buf(sctx, &hdr, sizeof(hdr));
+	return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
+					&sctx->send_off);
 }
 
 /*
@@ -590,7 +590,8 @@ static int send_cmd(struct send_ctx *sctx)
 	crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
 	hdr->crc = cpu_to_le32(crc);
 
-	ret = write_buf(sctx, sctx->send_buf, sctx->send_size);
+	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
+					&sctx->send_off);
 
 	sctx->total_send_size += sctx->send_size;
 	sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9934e948e57f..1bf4f32fd4ef 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -130,4 +130,5 @@ enum {
 
 #ifdef __KERNEL__
 long btrfs_ioctl_send(struct file *mnt_file, void __user *arg);
+int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off);
 #endif

From ebb3dad4353b94c12a5cffab4397727c19f088e5 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Thu, 13 Sep 2012 20:29:02 -0600
Subject: [PATCH 076/121] Btrfs: using for_each_set_bit_from to simplify the
 code

Using for_each_set_bit_from() to simplify the code.

spatch with a semantic match is used to found this.
(http://coccinelle.lip6.fr/)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
---
 fs/btrfs/free-space-cache.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6b10acfc2f5c..b107e68797f4 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1454,9 +1454,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 			  max_t(u64, *offset, bitmap_info->offset));
 	bits = bytes_to_bits(*bytes, ctl->unit);
 
-	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
-	     i < BITS_PER_BITMAP;
-	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
 		next_zero = find_next_zero_bit(bitmap_info->bitmap,
 					       BITS_PER_BITMAP, i);
 		if ((next_zero - i) >= bits) {
@@ -2307,9 +2305,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 
 again:
 	found_bits = 0;
-	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
-	     i < BITS_PER_BITMAP;
-	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
 		next_zero = find_next_zero_bit(entry->bitmap,
 					       BITS_PER_BITMAP, i);
 		if (next_zero - i >= min_bits) {

From b3ae244e7174d981c09ad7a6a68e7909d600aaca Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Thu, 13 Sep 2012 16:04:34 -0600
Subject: [PATCH 077/121] btrfs: return EPERM upon rmdir on a subvolume

A subvolume cannot be deleted via rmdir, but the error code ENOTEMPTY
is confusing. Return EPERM instead, as this is not permitted.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d41e467b4aa2..1c50f7c4f5ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3200,9 +3200,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
 
-	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-	    btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
+	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+		return -EPERM;
 
 	trans = __unlink_start_trans(dir, dentry);
 	if (IS_ERR(trans))

From 60376ce4a8396bc5cd777be05b6a9bf044520f42 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 10:34:40 -0400
Subject: [PATCH 078/121] Btrfs: fix race in sync and freeze again

I screwed this up, there is a race between checking if there is a running
transaction and actually starting a transaction in sync where we could race
with a freezer and get ourselves into trouble.  To fix this we need to make
a new join type to only do the try lock on the freeze stuff.  If it fails
we'll return EPERM and just return from sync.  This fixes a hang Liu Bo
reported when running xfstest 68 in a loop.  Thanks,

Reported-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/super.c       | 15 ++++++---------
 fs/btrfs/transaction.c | 12 +++++++++++-
 fs/btrfs/transaction.h |  1 +
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 867e8e799dea..903ab2d7068a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -854,16 +854,13 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	btrfs_wait_ordered_extents(root, 0, 0);
 
-	spin_lock(&fs_info->trans_lock);
-	if (!fs_info->running_transaction) {
-		spin_unlock(&fs_info->trans_lock);
-		return 0;
-	}
-	spin_unlock(&fs_info->trans_lock);
-
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
+	trans = btrfs_join_transaction_freeze(root);
+	if (IS_ERR(trans)) {
+		/* Frozen, don't bother */
+		if (PTR_ERR(trans) == -EPERM)
+			return 0;
 		return PTR_ERR(trans);
+	}
 	return btrfs_commit_transaction(trans, root);
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c01dec70c960..e4bfac8d54b8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -272,6 +272,7 @@ enum btrfs_trans_type {
 	TRANS_JOIN,
 	TRANS_USERSPACE,
 	TRANS_JOIN_NOLOCK,
+	TRANS_JOIN_FREEZE,
 };
 
 static int may_wait_transaction(struct btrfs_root *root, int type)
@@ -341,7 +342,11 @@ again:
 	if (!h)
 		return ERR_PTR(-ENOMEM);
 
-	sb_start_intwrite(root->fs_info->sb);
+	if (!__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
+		if (type == TRANS_JOIN_FREEZE)
+			return ERR_PTR(-EPERM);
+		sb_start_intwrite(root->fs_info->sb);
+	}
 
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
@@ -424,6 +429,11 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
+struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_JOIN_FREEZE, 0);
+}
+
 /* wait for a transaction commit to be fully complete */
 static noinline void wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b6463e128048..fbf8313b9d67 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -102,6 +102,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_noflush(
 					struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,

From ab26e9d6c8a760aa5c4518eb2cb1a85516b3a4f7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 14 Sep 2012 02:58:05 -0600
Subject: [PATCH 079/121] Btrfs: cleanup for duplicated code in
 find_free_extent

There is already an 'add free space' phrase in front of this one, we
needn't to redo it.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0bc83b3f33db..27a6b3e6fa41 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5828,10 +5828,6 @@ checks:
 
 		trace_btrfs_reserve_extent(orig_root, block_group,
 					   search_start, num_bytes);
-		if (offset < search_start)
-			btrfs_add_free_space(used_block_group, offset,
-					     search_start - offset);
-		BUG_ON(offset > search_start);
 		if (used_block_group != block_group)
 			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);

From 2e90cf858fd2639eae2fb3f2173f7633cb15bf38 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 14 Sep 2012 02:58:06 -0600
Subject: [PATCH 080/121] Btrfs: cleanup fs_info->hashers

fs_info->hashers is now an obsolete one.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/ctree.h   | 1 -
 fs/btrfs/disk-io.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d66dc1c6a40d..1372057e1ec1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1253,7 +1253,6 @@ struct btrfs_fs_info {
 	struct mutex reloc_mutex;
 
 	struct list_head trans_list;
-	struct list_head hashers;
 	struct list_head dead_roots;
 	struct list_head caching_block_groups;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8d633e3e3919..5d3f813e13a0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1984,7 +1984,6 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
-	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	INIT_LIST_HEAD(&fs_info->ordered_operations);
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);

From 6bbe3a9c805fcb8cd8d396dafd32078181a7cdd5 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 14 Sep 2012 02:58:07 -0600
Subject: [PATCH 081/121] Btrfs: kill obsolete arguments in
 btrfs_wait_ordered_extents

nocow_only is now an obsolete argument.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/extent-tree.c  |  4 ++--
 fs/btrfs/ordered-data.c | 12 +-----------
 fs/btrfs/ordered-data.h |  3 +--
 fs/btrfs/relocation.c   |  2 +-
 fs/btrfs/super.c        |  2 +-
 fs/btrfs/transaction.c  |  2 +-
 6 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 27a6b3e6fa41..a3a902fdeb49 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3722,7 +3722,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
-		btrfs_wait_ordered_extents(root, 0, 0);
+		btrfs_wait_ordered_extents(root, 0);
 		return;
 	}
 
@@ -3748,7 +3748,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_ordered_extents(root, 0, 0);
+			btrfs_wait_ordered_extents(root, 0);
 		} else {
 			time_left = schedule_timeout_killable(1);
 			if (time_left)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e2b3d994ec01..7772f02ba28e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -468,8 +468,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
-				int nocow_only, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 {
 	struct list_head splice;
 	struct list_head *cur;
@@ -484,15 +483,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root,
 		cur = splice.next;
 		ordered = list_entry(cur, struct btrfs_ordered_extent,
 				     root_extent_list);
-		if (nocow_only &&
-		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
-		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
-			list_move(&ordered->root_extent_list,
-				  &root->fs_info->ordered_extents);
-			cond_resched_lock(&root->fs_info->ordered_extent_lock);
-			continue;
-		}
-
 		list_del_init(&ordered->root_extent_list);
 		atomic_inc(&ordered->refs);
 
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d1ddaeff1356..dd27a0b46a37 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,8 +190,7 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root,
-				int nocow_only, int delay_iput);
+void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
 #endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7e7fd1bcfc54..6e530bb86c94 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4058,7 +4058,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	       (unsigned long long)rc->block_group->flags);
 
 	btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
-	btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
 
 	while (1) {
 		mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 903ab2d7068a..5aa3b8182d96 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -852,7 +852,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_wait_ordered_extents(root, 0, 0);
+	btrfs_wait_ordered_extents(root, 0);
 
 	trans = btrfs_join_transaction_freeze(root);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e4bfac8d54b8..c9265a603488 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1475,7 +1475,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		if (flush_on_commit || snap_pending) {
 			btrfs_start_delalloc_inodes(root, 1);
-			btrfs_wait_ordered_extents(root, 0, 1);
+			btrfs_wait_ordered_extents(root, 1);
 		}
 
 		ret = btrfs_run_delayed_items(trans, root);

From 98114659e0d467e2c0ee6f24f2429329328fc312 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 11:22:38 -0400
Subject: [PATCH 082/121] Btrfs: fix race with freeze and free space inodes

So we start our freeze, somebody comes in and does an fsync() on a file
where we have to commit a transaction for whatever reason, and we will
deadlock because the freeze is waiting on FS_FREEZE people to stop writing
to the file system, but the transaction is waiting for its free space inodes
to be written out, which are in turn waiting on sb_start_intwrite while
trying to write the file extents.  To fix this we'll just skip the
sb_start_intwrite() if we TRANS_JOIN_NOLOCK since we're being waited on by a
transaction commit so we're safe wrt to freeze and this will keep us from
deadlocking.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/transaction.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c9265a603488..a4fe5494d01b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -342,7 +342,15 @@ again:
 	if (!h)
 		return ERR_PTR(-ENOMEM);
 
-	if (!__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
+	/*
+	 * If we are JOIN_NOLOCK we're already committing a transaction and
+	 * waiting on this guy, so we don't need to do the sb_start_intwrite
+	 * because we're already holding a ref.  We need this because we could
+	 * have raced in and did an fsync() on a file which can kick a commit
+	 * and then we deadlock with somebody doing a freeze.
+	 */
+	if (type != TRANS_JOIN_NOLOCK &&
+	    !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
 		if (type == TRANS_JOIN_FREEZE)
 			return ERR_PTR(-EPERM);
 		sb_start_intwrite(root->fs_info->sb);
@@ -601,7 +609,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	sb_end_intwrite(root->fs_info->sb);
+	if (lock)
+		sb_end_intwrite(root->fs_info->sb);
 
 	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);

From ff44c6e36dc9dcc02652a1105b120bdf08cea9f7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 12:59:20 -0400
Subject: [PATCH 083/121] Btrfs: do not hold the write_lock on the extent tree
 while logging

Dave Sterba pointed out a sleeping while atomic bug while doing fsync.  This
is because I'm an idiot and didn't realize that rwlock's were spin locks, so
we've been holding this thing while doing allocations and such which is not
good.  This patch fixes this by dropping the write lock before we do
anything heavy and re-acquire it when it is done.  We also need to take a
ref on the em's in case their corresponding pages are evicted and mark them
as being logged so that releasepage does not remove them and doesn't remove
them from our local list.  Thanks,

Reported-by: Dave Sterba <dave@jikos.cz>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent_map.c |  3 ++-
 fs/btrfs/extent_map.h |  1 +
 fs/btrfs/tree-log.c   | 21 +++++++++++++++++----
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8d1364d385d6..b8cbc8d5c7f7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -407,7 +407,8 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
 	rb_erase(&em->rb_node, &tree->map);
-	list_del_init(&em->list);
+	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+		list_del_init(&em->list);
 	em->in_tree = 0;
 	return ret;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 8e6294b51357..679225555f7b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -13,6 +13,7 @@
 #define EXTENT_FLAG_COMPRESSED 1
 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
 
 struct extent_map {
 	struct rb_node rb_node;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 038a5229404a..ed1f7ce7219a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2945,6 +2945,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		list_del_init(&em->list);
 		if (em->generation <= test_gen)
 			continue;
+		/* Need a ref to keep it from getting evicted from cache */
+		atomic_inc(&em->refs);
+		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
 		list_add_tail(&em->list, &extents);
 	}
 
@@ -2954,13 +2957,18 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 		em = list_entry(extents.next, struct extent_map, list);
 
 		list_del_init(&em->list);
+		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
 
 		/*
 		 * If we had an error we just need to delete everybody from our
 		 * private list.
 		 */
-		if (ret)
+		if (ret) {
+			free_extent_map(em);
 			continue;
+		}
+
+		write_unlock(&tree->lock);
 
 		/*
 		 * If the previous EM and the last extent we left off on aren't
@@ -2971,21 +2979,26 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 			ret = copy_items(trans, inode, dst_path, args.src,
 					 args.start_slot, args.nr,
 					 LOG_INODE_ALL);
-			if (ret)
+			if (ret) {
+				free_extent_map(em);
+				write_lock(&tree->lock);
 				continue;
+			}
 			btrfs_release_path(path);
 			args.nr = 0;
 		}
 
 		ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
+		free_extent_map(em);
+		write_lock(&tree->lock);
 	}
+	WARN_ON(!list_empty(&extents));
+	write_unlock(&tree->lock);
 
 	if (!ret && args.nr)
 		ret = copy_items(trans, inode, dst_path, args.src,
 				 args.start_slot, args.nr, LOG_INODE_ALL);
 	btrfs_release_path(path);
-	WARN_ON(!list_empty(&extents));
-	write_unlock(&tree->lock);
 	return ret;
 }
 

From b5bae2612af92fd8e7bcdcf7ce3e0259e8d341c9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 13:43:01 -0400
Subject: [PATCH 084/121] Btrfs: fix race when getting the eb out of
 page->private

We can race when checking wether PagePrivate is set on a page and we
actually have an eb saved in the pages private pointer.  We could have
easily written out this page and released it in the time that we did the
pagevec lookup and actually got around to looking at this page.  So use
mapping->private_lock to ensure we get a consistent view of the
page->private pointer.  This is inline with the alloc and releasepage paths
which use private_lock when manipulating page->private.  Thanks,

Reported-by: David Sterba <dave@jikos.cz>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent_io.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 90bd9f768c0a..a2c21570adf5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3256,20 +3256,35 @@ retry:
 				break;
 			}
 
+			spin_lock(&mapping->private_lock);
+			if (!PagePrivate(page)) {
+				spin_unlock(&mapping->private_lock);
+				continue;
+			}
+
 			eb = (struct extent_buffer *)page->private;
+
+			/*
+			 * Shouldn't happen and normally this would be a BUG_ON
+			 * but no sense in crashing the users box for something
+			 * we can survive anyway.
+			 */
 			if (!eb) {
+				spin_unlock(&mapping->private_lock);
 				WARN_ON(1);
 				continue;
 			}
 
-			if (eb == prev_eb)
-				continue;
-
-			if (!atomic_inc_not_zero(&eb->refs)) {
-				WARN_ON(1);
+			if (eb == prev_eb) {
+				spin_unlock(&mapping->private_lock);
 				continue;
 			}
 
+			ret = atomic_inc_not_zero(&eb->refs);
+			spin_unlock(&mapping->private_lock);
+			if (!ret)
+				continue;
+
 			prev_eb = eb;
 			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
 			if (!ret) {

From 892951a92ebf8b390815b24f9418f297a5081898 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 13:51:53 -0400
Subject: [PATCH 085/121] Btrfs: remove unused write cache pages hook

The btree inode has it's own write cache pages so we can remove this write
cache pages hook as it's not used.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/disk-io.c | 47 ----------------------------------------------
 1 file changed, 47 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5d3f813e13a0..2c18a4eb4d5a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3347,52 +3347,6 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
-int btree_lock_page_hook(struct page *page, void *data,
-				void (*flush_fn)(void *))
-{
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_buffer *eb;
-
-	/*
-	 * We culled this eb but the page is still hanging out on the mapping,
-	 * carry on.
-	 */
-	if (!PagePrivate(page))
-		goto out;
-
-	eb = (struct extent_buffer *)page->private;
-	if (!eb) {
-		WARN_ON(1);
-		goto out;
-	}
-	if (page != eb->pages[0])
-		goto out;
-
-	if (!btrfs_try_tree_write_lock(eb)) {
-		flush_fn(data);
-		btrfs_tree_lock(eb);
-	}
-	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
-	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		if (root->fs_info->dirty_metadata_bytes >= eb->len)
-			root->fs_info->dirty_metadata_bytes -= eb->len;
-		else
-			WARN_ON(1);
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
-
-	btrfs_tree_unlock(eb);
-out:
-	if (!trylock_page(page)) {
-		flush_fn(data);
-		lock_page(page);
-	}
-	return 0;
-}
-
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
@@ -3787,7 +3741,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
-	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,

From 926ced123bd6651b30a07f65a2a8a0b26154cd58 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 13:58:59 -0400
Subject: [PATCH 086/121] Btrfs: don't do anything in our ->freeze_fs and
 ->unfreeze_fs

We do not need to do anything special to freeze or unfreeze, it's all taken
care of by the generic work, and what we currently have is wrong anyway
since we shouldn't be returnning to userspace with mutexes held anyway.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/super.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5aa3b8182d96..fb260bbf59c6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1511,17 +1511,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-	mutex_lock(&fs_info->transaction_kthread_mutex);
-	mutex_lock(&fs_info->cleaner_mutex);
 	return 0;
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-	mutex_unlock(&fs_info->cleaner_mutex);
-	mutex_unlock(&fs_info->transaction_kthread_mutex);
 	return 0;
 }
 

From c3308f84c1743eabb91f4976a314d118d5ea2342 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 14 Sep 2012 14:51:22 -0400
Subject: [PATCH 087/121] Btrfs: fix punch hole when no extent exists

I saw the warning in btrfs_drop_extent_cache where our end is less than our
start while running xfstests 68 in a loop.  This is because we
unconditionally do drop_end = min(end, extent_end) in
__btrfs_drop_extents(), even though we may not have found an extent in the
range we were looking to drop.  So keep track of wether or not we found
something, and if we didn't just use our end.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d0fc4c5aaf15..110d3cb7b6fe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -609,6 +609,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int ret;
 	int modify_tree = -1;
 	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+	int found = 0;
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -674,6 +675,7 @@ next_slot:
 			goto next_slot;
 		}
 
+		found = 1;
 		search_start = max(key.offset, start);
 		if (recow || !modify_tree) {
 			modify_tree = -1;
@@ -829,7 +831,7 @@ next_slot:
 	}
 
 	if (drop_end)
-		*drop_end = min(end, extent_end);
+		*drop_end = found ? min(end, extent_end) : end;
 	btrfs_release_path(path);
 	return ret;
 }

From 7e97b8daf63487c20f78487bd4045f39b0d97cf4 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 7 Sep 2012 05:56:55 -0600
Subject: [PATCH 088/121] btrfs: allow setting NOCOW for a zero sized file via
 ioctl

Hi,

the patch si simple, but it has user visible impact and I'm not quite sure how
to resolve it.

In short, $subj says it, chattr -C supports it and we want to use it.

The conditions that acutally allow to change the NOCOW flag are clear. What if
I try to set the flag on a file that is not empty? Options:

1) whole ioctl will fail, EINVAL
2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file
     will stay COW-ed and checksummed
2.2) ioctl will succeed, flag will not be removed and a syslog message will
     warn that the COW flag has not been changed
2.2.1) dtto, no syslog message

Man page of chattr states that

 "If it is set on a file which already has data blocks, it is undefined when
 the blocks assigned to the file will be fully stable."

Yes, it's undefined and with current implementation it'll never happen. So from
this end, the user cannot expect anything. I'm trying to find a reasonable
behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of
flags in a deep directory does not fail unnecessarily and does not pollute the
log.

My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting
the fact that I know the code and otherwise would look there before consulting
the documentation.

The patch implements 2.2.1.

david

-------------8<-------------------
From: David Sterba <dsterba@suse.cz>

It's safe to turn off checksums for a zero sized file.

http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030

"We cannot switch on NODATASUM for a file that already has extents that
are checksummed. The invariant here is that either all the extents or
none are checksummed.

Theoretically it's possible to add/remove all checksums from a given
file, but it's a potentially longtime operation, the file has to be in
some intermediate state where the checksums partially exist but have to
be ignored (for the csum->nocsum) until the file is fully converted,
this brings more special cases to extent handling, it has to survive
power failure and remain consistent, and probably needs to be restarted
after next mount."

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/ioctl.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 38e3d6ab89d3..4d7f4bbf4c96 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -181,6 +181,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	int ret;
 	u64 ip_oldflags;
 	unsigned int i_oldflags;
+	umode_t mode;
 
 	if (btrfs_root_readonly(root))
 		return -EROFS;
@@ -203,6 +204,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 	ip_oldflags = ip->flags;
 	i_oldflags = inode->i_flags;
+	mode = inode->i_mode;
 
 	flags = btrfs_mask_flags(inode->i_mode, flags);
 	oldflags = btrfs_flags_to_ioctl(ip->flags);
@@ -237,10 +239,31 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		ip->flags |= BTRFS_INODE_DIRSYNC;
 	else
 		ip->flags &= ~BTRFS_INODE_DIRSYNC;
-	if (flags & FS_NOCOW_FL)
-		ip->flags |= BTRFS_INODE_NODATACOW;
-	else
-		ip->flags &= ~BTRFS_INODE_NODATACOW;
+	if (flags & FS_NOCOW_FL) {
+		if (S_ISREG(mode)) {
+			/*
+			 * It's safe to turn csums off here, no extents exist.
+			 * Otherwise we want the flag to reflect the real COW
+			 * status of the file and will not set it.
+			 */
+			if (inode->i_size == 0)
+				ip->flags |= BTRFS_INODE_NODATACOW
+					   | BTRFS_INODE_NODATASUM;
+		} else {
+			ip->flags |= BTRFS_INODE_NODATACOW;
+		}
+	} else {
+		/*
+		 * Revert back under same assuptions as above
+		 */
+		if (S_ISREG(mode)) {
+			if (inode->i_size == 0)
+				ip->flags &= ~(BTRFS_INODE_NODATACOW
+				             | BTRFS_INODE_NODATASUM);
+		} else {
+			ip->flags &= ~BTRFS_INODE_NODATACOW;
+		}
+	}
 
 	/*
 	 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS

From aa42ffd918c420d5625b25b7a0bc2bbde4c9f890 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 18 Sep 2012 03:52:23 -0600
Subject: [PATCH 089/121] Btrfs: fix off-by-one in file clone

Btrfs uses inclusive range end for lock_extent(), unlock_extent() and
related functions, so we made off-by-one errors in file clone.

This fixes it and also fixes some style problems.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/ioctl.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4d7f4bbf4c96..d6836af6d60f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2481,13 +2481,13 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	   another, and lock file content */
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
-		ordered = btrfs_lookup_first_ordered_extent(src, off+len);
+		lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+		ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
 		if (!ordered &&
-		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
-				   EXTENT_DELALLOC, 0, NULL))
+		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
+				    EXTENT_DELALLOC, 0, NULL))
 			break;
-		unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+		unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
 		btrfs_wait_ordered_range(src, off, len);
@@ -2561,7 +2561,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_release_path(path);
 
 			if (key.offset + datal <= off ||
-			    key.offset >= off+len)
+			    key.offset >= off + len - 1)
 				goto next;
 
 			memcpy(&new_key, &key, sizeof(new_key));
@@ -2662,8 +2662,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					new_key.offset += skip;
 				}
 
-				if (key.offset + datal > off+len)
-					trim = key.offset + datal - (off+len);
+				if (key.offset + datal > off + len)
+					trim = key.offset + datal - (off + len);
 
 				if (comp && (skip || trim)) {
 					ret = -EINVAL;
@@ -2740,7 +2740,7 @@ next:
 	ret = 0;
 out:
 	btrfs_release_path(path);
-	unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
+	unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 out_unlock:
 	mutex_unlock(&src->i_mutex);
 	mutex_unlock(&inode->i_mutex);

From 8732d44f806a9da9a7ca4d1704b8a1ed81639bc4 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 17 Sep 2012 23:52:38 -0600
Subject: [PATCH 090/121] Btrfs: fix the missing error information in
 create_pending_snapshot()

The macro btrfs_abort_transaction() can get the line number of the code
where the problem happens, so we should invoke it in the place that the
error occurs, or we will lose the line number.

Reported-by: David Sterba <dave@jikos.cz>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/transaction.c | 57 ++++++++++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a4fe5494d01b..910ff8051ba9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1073,7 +1073,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		goto fail;
 	} else if (IS_ERR(dir_item)) {
 		ret = PTR_ERR(dir_item);
-		goto abort_trans;
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
 	}
 	btrfs_release_path(path);
 
@@ -1084,8 +1085,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 * snapshot
 	 */
 	ret = btrfs_run_delayed_items(trans, root);
-	if (ret)	/* Transaction aborted */
-		goto abort_trans;
+	if (ret) {	/* Transaction aborted */
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
@@ -1118,7 +1121,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret) {
 		btrfs_tree_unlock(old);
 		free_extent_buffer(old);
-		goto abort_trans;
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
 	}
 
 	btrfs_set_lock_blocking(old);
@@ -1127,8 +1131,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	/* clean up in any case */
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	/* see comments in should_cow_block() */
 	root->force_cow = 1;
@@ -1140,8 +1146,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
 	btrfs_tree_unlock(tmp);
 	free_extent_buffer(tmp);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	/*
 	 * insert root back/forward references
@@ -1150,23 +1158,30 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				 parent_root->root_key.objectid,
 				 btrfs_ino(parent_inode), index,
 				 dentry->d_name.name, dentry->d_name.len);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	key.offset = (u64)-1;
 	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
 	if (IS_ERR(pending->snap)) {
 		ret = PTR_ERR(pending->snap);
-		goto abort_trans;
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
 	}
 
 	ret = btrfs_reloc_post_snapshot(trans, pending);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	ret = btrfs_insert_dir_item(trans, parent_root,
 				    dentry->d_name.name, dentry->d_name.len,
@@ -1174,15 +1189,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				    BTRFS_FT_DIR, index);
 	/* We have check then name at the beginning, so it is impossible. */
 	BUG_ON(ret == -EEXIST);
-	if (ret)
-		goto abort_trans;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
 					 dentry->d_name.len * 2);
 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	if (ret)
-		goto abort_trans;
+		btrfs_abort_transaction(trans, root, ret);
 fail:
 	dput(parent);
 	trans->block_rsv = rsv;
@@ -1193,10 +1210,6 @@ root_item_alloc_fail:
 path_alloc_fail:
 	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
 	return ret;
-
-abort_trans:
-	btrfs_abort_transaction(trans, root, ret);
-	goto fail;
 }
 
 /*

From 005d6427ac4f276d937a36ca6a1d62b181ed70bf Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 18 Sep 2012 07:52:32 -0600
Subject: [PATCH 091/121] btrfs: move transaction aborts to the point of
 failure

Call btrfs_abort_transaction as early as possible when an error
condition is detected, that way the line number reported is useful
and we're not clueless anymore which error path led to the abort.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/ctree.h       |  5 ++++
 fs/btrfs/extent-tree.c | 56 ++++++++++++++++++++++++++----------------
 fs/btrfs/root-tree.c   | 29 +++++++++++++---------
 fs/btrfs/volumes.c     | 37 +++++++++++++++++-----------
 4 files changed, 80 insertions(+), 47 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1372057e1ec1..399521ab61da 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3400,6 +3400,11 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
 	}
 }
 
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
 #define btrfs_abort_transaction(trans, root, errno)		\
 do {								\
 	__btrfs_abort_transaction(trans, root, __func__,	\
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a3a902fdeb49..395e222e39ab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5123,8 +5123,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			ret = remove_extent_backref(trans, extent_root, path,
 						    NULL, refs_to_drop,
 						    is_data);
-			if (ret)
-				goto abort;
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 			btrfs_release_path(path);
 			path->leave_spinning = 1;
 
@@ -5142,8 +5144,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 					btrfs_print_leaf(extent_root,
 							 path->nodes[0]);
 			}
-			if (ret < 0)
-				goto abort;
+			if (ret < 0) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 			extent_slot = path->slots[0];
 		}
 	} else if (ret == -ENOENT) {
@@ -5157,7 +5161,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		       (unsigned long long)owner_objectid,
 		       (unsigned long long)owner_offset);
 	} else {
-		goto abort;
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
 	}
 
 	leaf = path->nodes[0];
@@ -5167,8 +5172,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		BUG_ON(found_extent || extent_slot != path->slots[0]);
 		ret = convert_extent_item_v0(trans, extent_root, path,
 					     owner_objectid, 0);
-		if (ret < 0)
-			goto abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 
 		btrfs_release_path(path);
 		path->leave_spinning = 1;
@@ -5185,8 +5192,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			       (unsigned long long)bytenr);
 			btrfs_print_leaf(extent_root, path->nodes[0]);
 		}
-		if (ret < 0)
-			goto abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
+
 		extent_slot = path->slots[0];
 		leaf = path->nodes[0];
 		item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -5223,8 +5233,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			ret = remove_extent_backref(trans, extent_root, path,
 						    iref, refs_to_drop,
 						    is_data);
-			if (ret)
-				goto abort;
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 		}
 	} else {
 		if (found_extent) {
@@ -5241,27 +5253,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
-		if (ret)
-			goto abort;
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 		btrfs_release_path(path);
 
 		if (is_data) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
-			if (ret)
-				goto abort;
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 		}
 
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
-		if (ret)
-			goto abort;
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 	}
 out:
 	btrfs_free_path(path);
 	return ret;
-
-abort:
-	btrfs_abort_transaction(trans, extent_root, ret);
-	goto out;
 }
 
 /*
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 10d8e4d88071..eb923d087da7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -141,8 +141,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		return -ENOMEM;
 
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-	if (ret < 0)
-		goto out_abort;
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
@@ -166,16 +168,23 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_release_path(path);
 		ret = btrfs_search_slot(trans, root, key, path,
 				-1, 1);
-		if (ret < 0)
-			goto out_abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+
 		ret = btrfs_del_item(trans, root, path);
-		if (ret < 0)
-			goto out_abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 		btrfs_release_path(path);
 		ret = btrfs_insert_empty_item(trans, root, path,
 				key, sizeof(*item));
-		if (ret < 0)
-			goto out_abort;
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 		l = path->nodes[0];
 		slot = path->slots[0];
 		ptr = btrfs_item_ptr_offset(l, slot);
@@ -192,10 +201,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 out:
 	btrfs_free_path(path);
 	return ret;
-
-out_abort:
-	btrfs_abort_transaction(trans, root, ret);
-	goto out;
 }
 
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969aeeb71..4f4de51b5bdd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1775,15 +1775,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 	if (seeding_dev) {
 		ret = init_first_rw_device(trans, root, device);
-		if (ret)
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
 			goto error_trans;
+		}
 		ret = btrfs_finish_sprout(trans, root);
-		if (ret)
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
 			goto error_trans;
+		}
 	} else {
 		ret = btrfs_add_device(trans, root, device);
-		if (ret)
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
 			goto error_trans;
+		}
 	}
 
 	/*
@@ -1814,7 +1820,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 error_trans:
 	unlock_chunks(root);
-	btrfs_abort_transaction(trans, root, ret);
 	btrfs_end_transaction(trans, root);
 	rcu_string_free(device->name);
 	kfree(device);
@@ -3608,12 +3613,16 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
 				  &sys_chunk_size, &sys_stripe_size,
 				  sys_chunk_offset, alloc_profile);
-	if (ret)
-		goto abort;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
-	if (ret)
-		goto abort;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	/*
 	 * Modifying chunk tree needs allocating new blocks from both
@@ -3623,19 +3632,19 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	 */
 	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
 				   chunk_size, stripe_size);
-	if (ret)
-		goto abort;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
 				   sys_chunk_offset, sys_chunk_size,
 				   sys_stripe_size);
 	if (ret)
-		goto abort;
+		btrfs_abort_transaction(trans, root, ret);
 
-	return 0;
+out:
 
-abort:
-	btrfs_abort_transaction(trans, root, ret);
 	return ret;
 }
 

From 0aa4a17d8232e7d8a42763b53bb9943bc0484b18 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 19 Sep 2012 15:42:38 -0400
Subject: [PATCH 092/121] Btrfs: handle not finding the extent exactly when
 logging changed extents

I started hitting warnings when running xfstest 68 in a loop because there
were EM's that were not lined up properly with the physical extents.  This
is ok, if we do something like punch a hole or write to a preallocated space
or something like that we can have an EM that doesn't cover the entire
physical extent.  So fix the tree logging stuff to cope with this case so we
don't just commit the transaction.  With this patch I no longer see the
warnings from the tree logging code.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/tree-log.c | 46 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ed1f7ce7219a..e0ef92f91d66 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2833,6 +2833,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
 	u64 start = em->mod_start;
+	u64 search_start = start;
 	u64 len = em->mod_len;
 	u64 num_bytes;
 	int nritems;
@@ -2848,23 +2849,55 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	while (len) {
 		if (args->nr)
 			goto next_slot;
+again:
 		key.objectid = btrfs_ino(inode);
 		key.type = BTRFS_EXTENT_DATA_KEY;
-		key.offset = start;
+		key.offset = search_start;
 
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			return ret;
+
 		if (ret) {
 			/*
-			 * This shouldn't happen, but it might so warn and
-			 * return an error.
+			 * A rare case were we can have an em for a section of a
+			 * larger extent so we need to make sure that this em
+			 * falls within the extent we've found.  If not we just
+			 * bail and go back to ye-olde way of doing things but
+			 * it happens often enough in testing that we need to do
+			 * this dance to make sure.
 			 */
-			WARN_ON(1);
-			return -ENOENT;
+			do {
+				if (path->slots[0] == 0) {
+					btrfs_release_path(path);
+					if (search_start == 0)
+						return -ENOENT;
+					search_start--;
+					goto again;
+				}
+
+				path->slots[0]--;
+				btrfs_item_key_to_cpu(path->nodes[0], &key,
+						      path->slots[0]);
+				if (key.objectid != btrfs_ino(inode) ||
+				    key.type != BTRFS_EXTENT_DATA_KEY) {
+					btrfs_release_path(path);
+					return -ENOENT;
+				}
+			} while (key.offset > start);
+
+			fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					    struct btrfs_file_extent_item);
+			num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
+								fi);
+			if (key.offset + num_bytes <= start) {
+				btrfs_release_path(path);
+				return -ENOENT;
+			}
 		}
 		args->src = path->nodes[0];
 next_slot:
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 		fi = btrfs_item_ptr(args->src, path->slots[0],
 				    struct btrfs_file_extent_item);
 		if (args->nr &&
@@ -2898,8 +2931,9 @@ next_slot:
 		} else {
 			len -= num_bytes;
 		}
-		start += btrfs_file_extent_num_bytes(args->src, fi);
+		start = key.offset + num_bytes;
 		args->next_offset = start;
+		search_start = start;
 
 		if (path->slots[0] < nritems) {
 			if (len)

From 5a1d7843ca4b3a9009bea87f85ad33854b910aea Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 17 Aug 2012 14:04:41 -0700
Subject: [PATCH 093/121] btrfs: improved readablity for add_inode_ref

Moved part of the code into a sub function and replaced most of the gotos
by ifs, hoping that it will be easier to read now.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Mark Fasheh <mfasheh@suse.de>
---
 fs/btrfs/tree-log.c | 232 +++++++++++++++++++++++---------------------
 1 file changed, 124 insertions(+), 108 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e0ef92f91d66..15dae589e59f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -785,6 +785,95 @@ out:
 	return match;
 }
 
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_path *path,
+				  struct btrfs_root *log_root,
+				  struct inode *dir, struct inode *inode,
+				  struct btrfs_key *key,
+				  struct extent_buffer *eb,
+				  struct btrfs_inode_ref *ref,
+				  char *name, int namelen, int *search_done)
+{
+	int ret;
+	struct btrfs_dir_item *di;
+
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret == 0) {
+		char *victim_name;
+		int victim_name_len;
+		struct btrfs_inode_ref *victim_ref;
+		unsigned long ptr;
+		unsigned long ptr_end;
+		struct extent_buffer *leaf = path->nodes[0];
+
+		/* are we trying to overwrite a back ref for the root directory
+		 * if so, just jump out, we're done
+		 */
+		if (key->objectid == key->offset)
+			return 1;
+
+		/* check all the names in this back reference to see
+		 * if they are in the log.  if so, we allow them to stay
+		 * otherwise they must be unlinked as a conflict
+		 */
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+		while (ptr < ptr_end) {
+			victim_ref = (struct btrfs_inode_ref *)ptr;
+			victim_name_len = btrfs_inode_ref_name_len(leaf,
+								   victim_ref);
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			BUG_ON(!victim_name);
+
+			read_extent_buffer(leaf, victim_name,
+					   (unsigned long)(victim_ref + 1),
+					   victim_name_len);
+
+			if (!backref_in_log(log_root, key, victim_name,
+					    victim_name_len)) {
+				btrfs_inc_nlink(inode);
+				btrfs_release_path(path);
+
+				ret = btrfs_unlink_inode(trans, root, dir,
+							 inode, victim_name,
+							 victim_name_len);
+				btrfs_run_delayed_items(trans, root);
+			}
+			kfree(victim_name);
+			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+		}
+		BUG_ON(ret);
+
+		/*
+		 * NOTE: we have searched root tree and checked the
+		 * coresponding ref, it does not need to check again.
+		 */
+		*search_done = 1;
+	}
+	btrfs_release_path(path);
+
+	/* look for a conflicting sequence number */
+	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+					 btrfs_inode_ref_index(eb, ref),
+					 name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(path);
+
+	/* look for a conflicing name */
+	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
+				   name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(path);
+
+	return 0;
+}
 
 /*
  * replay one inode back reference item found in the log tree.
@@ -800,7 +889,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_key *key)
 {
 	struct btrfs_inode_ref *ref;
-	struct btrfs_dir_item *di;
 	struct inode *dir;
 	struct inode *inode;
 	unsigned long ref_ptr;
@@ -829,126 +917,54 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	ref_ptr = btrfs_item_ptr_offset(eb, slot);
 	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
 
-again:
-	ref = (struct btrfs_inode_ref *)ref_ptr;
+	while (ref_ptr < ref_end) {
+		ref = (struct btrfs_inode_ref *)ref_ptr;
 
-	namelen = btrfs_inode_ref_name_len(eb, ref);
-	name = kmalloc(namelen, GFP_NOFS);
-	BUG_ON(!name);
+		namelen = btrfs_inode_ref_name_len(eb, ref);
+		name = kmalloc(namelen, GFP_NOFS);
+		BUG_ON(!name);
 
-	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+		read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
 
-	/* if we already have a perfect match, we're done */
-	if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
-			 btrfs_inode_ref_index(eb, ref),
-			 name, namelen)) {
-		goto out;
-	}
+		/* if we already have a perfect match, we're done */
+		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+				  btrfs_inode_ref_index(eb, ref),
+				  name, namelen)) {
+			/*
+			 * look for a conflicting back reference in the
+			 * metadata. if we find one we have to unlink that name
+			 * of the file before we add our new link.  Later on, we
+			 * overwrite any existing back reference, and we don't
+			 * want to create dangling pointers in the directory.
+			 */
 
-	/*
-	 * look for a conflicting back reference in the metadata.
-	 * if we find one we have to unlink that name of the file
-	 * before we add our new link.  Later on, we overwrite any
-	 * existing back reference, and we don't want to create
-	 * dangling pointers in the directory.
-	 */
-
-	if (search_done)
-		goto insert;
-
-	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
-	if (ret == 0) {
-		char *victim_name;
-		int victim_name_len;
-		struct btrfs_inode_ref *victim_ref;
-		unsigned long ptr;
-		unsigned long ptr_end;
-		struct extent_buffer *leaf = path->nodes[0];
-
-		/* are we trying to overwrite a back ref for the root directory
-		 * if so, just jump out, we're done
-		 */
-		if (key->objectid == key->offset)
-			goto out_nowrite;
-
-		/* check all the names in this back reference to see
-		 * if they are in the log.  if so, we allow them to stay
-		 * otherwise they must be unlinked as a conflict
-		 */
-		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
-		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
-		while (ptr < ptr_end) {
-			victim_ref = (struct btrfs_inode_ref *)ptr;
-			victim_name_len = btrfs_inode_ref_name_len(leaf,
-								   victim_ref);
-			victim_name = kmalloc(victim_name_len, GFP_NOFS);
-			BUG_ON(!victim_name);
-
-			read_extent_buffer(leaf, victim_name,
-					   (unsigned long)(victim_ref + 1),
-					   victim_name_len);
-
-			if (!backref_in_log(log, key, victim_name,
-					    victim_name_len)) {
-				btrfs_inc_nlink(inode);
-				btrfs_release_path(path);
-
-				ret = btrfs_unlink_inode(trans, root, dir,
-							 inode, victim_name,
-							 victim_name_len);
-				btrfs_run_delayed_items(trans, root);
+			if (!search_done) {
+				ret = __add_inode_ref(trans, root, path, log,
+						      dir, inode, key, eb, ref,
+						      name, namelen,
+						      &search_done);
+				if (ret == 1)
+					goto out;
+				BUG_ON(ret);
 			}
-			kfree(victim_name);
-			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+
+			/* insert our name */
+			ret = btrfs_add_link(trans, dir, inode, name, namelen,
+					     0, btrfs_inode_ref_index(eb, ref));
+			BUG_ON(ret);
+
+			btrfs_update_inode(trans, root, inode);
 		}
-		BUG_ON(ret);
 
-		/*
-		 * NOTE: we have searched root tree and checked the
-		 * coresponding ref, it does not need to check again.
-		 */
-		search_done = 1;
+		ref_ptr = (unsigned long)(ref + 1) + namelen;
+		kfree(name);
 	}
-	btrfs_release_path(path);
-
-	/* look for a conflicting sequence number */
-	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
-					 btrfs_inode_ref_index(eb, ref),
-					 name, namelen, 0);
-	if (di && !IS_ERR(di)) {
-		ret = drop_one_dir_item(trans, root, path, dir, di);
-		BUG_ON(ret);
-	}
-	btrfs_release_path(path);
-
-	/* look for a conflicing name */
-	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
-				   name, namelen, 0);
-	if (di && !IS_ERR(di)) {
-		ret = drop_one_dir_item(trans, root, path, dir, di);
-		BUG_ON(ret);
-	}
-	btrfs_release_path(path);
-
-insert:
-	/* insert our name */
-	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
-			     btrfs_inode_ref_index(eb, ref));
-	BUG_ON(ret);
-
-	btrfs_update_inode(trans, root, inode);
-
-out:
-	ref_ptr = (unsigned long)(ref + 1) + namelen;
-	kfree(name);
-	if (ref_ptr < ref_end)
-		goto again;
 
 	/* finally write the back reference in the inode */
 	ret = overwrite_item(trans, root, path, eb, slot, key);
 	BUG_ON(ret);
 
-out_nowrite:
+out:
 	btrfs_release_path(path);
 	iput(dir);
 	iput(inode);

From f186373fef005cee948a4a39e6a14c2e5f517298 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.de>
Date: Wed, 8 Aug 2012 11:32:27 -0700
Subject: [PATCH 094/121] btrfs: extended inode refs

This patch adds basic support for extended inode refs. This includes support
for link and unlink of the refs, which basically gets us support for rename
as well.

Inode creation does not need changing - extended refs are only added after
the ref array is full.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
---
 fs/btrfs/backref.c    |  68 +++++++++
 fs/btrfs/backref.h    |   5 +
 fs/btrfs/ctree.h      |  53 ++++++-
 fs/btrfs/hash.h       |  10 ++
 fs/btrfs/inode-item.c | 287 ++++++++++++++++++++++++++++++++--
 fs/btrfs/inode.c      |  23 +--
 fs/btrfs/tree-log.c   | 347 +++++++++++++++++++++++++++++++++++-------
 7 files changed, 712 insertions(+), 81 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7084431b7c9c..dc963121d1db 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1109,6 +1109,74 @@ static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 				found_key);
 }
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off)
+{
+	int ret, slot;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = start_off;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * If the item at offset is not found,
+			 * btrfs_search_slot will point us to the slot
+			 * where it should be inserted. In our case
+			 * that will be the slot directly before the
+			 * next INODE_REF_KEY_V2 item. In the case
+			 * that we're pointing to the last slot in a
+			 * leaf, we must move one leaf over.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret) {
+				if (ret >= 1)
+					ret = -ENOENT;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/*
+		 * Check that we're still looking at an extended ref key for
+		 * this particular objectid. If we have different
+		 * objectid or type then there are no more to be found
+		 * in the tree and we can exit.
+		 */
+		ret = -ENOENT;
+		if (found_key.objectid != inode_objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		ret = 0;
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		extref = (struct btrfs_inode_extref *)ptr;
+		*ret_extref = extref;
+		if (found_off)
+			*found_off = found_key.offset;
+		break;
+	}
+
+	return ret;
+}
+
 /*
  * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
  * of the path are separated by '/' and the path is guaranteed to be
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 4fda5d8029a7..0b920c113952 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -70,4 +70,9 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 					struct btrfs_path *path);
 void free_ipath(struct inode_fs_paths *ipath);
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off);
+
 #endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 399521ab61da..50dcd0fbae11 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -154,6 +154,13 @@ struct btrfs_ordered_sum;
  */
 #define BTRFS_NAME_LEN 255
 
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
 /* 32 bytes in various csum fields */
 #define BTRFS_CSUM_SIZE 32
 
@@ -489,6 +496,8 @@ struct btrfs_super_block {
  */
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
 
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP			\
@@ -496,7 +505,8 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
-	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -643,6 +653,14 @@ struct btrfs_inode_ref {
 	/* name goes here */
 } __attribute__ ((__packed__));
 
+struct btrfs_inode_extref {
+	__le64 parent_objectid;
+	__le64 index;
+	__le16 name_len;
+	__u8   name[0];
+	/* name goes here */
+} __attribute__ ((__packed__));
+
 struct btrfs_timespec {
 	__le64 sec;
 	__le32 nsec;
@@ -1601,6 +1619,7 @@ struct btrfs_ioctl_defrag_range_args {
  */
 #define BTRFS_INODE_ITEM_KEY		1
 #define BTRFS_INODE_REF_KEY		12
+#define BTRFS_INODE_EXTREF_KEY		13
 #define BTRFS_XATTR_ITEM_KEY		24
 #define BTRFS_ORPHAN_ITEM_KEY		48
 /* reserve 2-15 close to the inode for later flexibility */
@@ -1987,6 +2006,13 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
 BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+		   parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+		   name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -3184,12 +3210,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index);
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_path *path,
-			const char *name, int name_len,
-			u64 inode_objectid, u64 ref_objectid, int mod);
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      const char *name, int name_len,
+			      u64 inode_objectid, u64 ref_objectid, int mod,
+			      u64 *ret_index);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -3197,6 +3223,19 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, struct btrfs_path *path,
 		       struct btrfs_key *location, int mod);
 
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+				   u64 ref_objectid, const char *name,
+				   int name_len,
+				   struct btrfs_inode_extref **extref_ret);
+
 /* file-item.c */
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, u64 bytenr, u64 len);
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index db2ff9773b99..1d982812ab67 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -24,4 +24,14 @@ static inline u64 btrfs_name_hash(const char *name, int len)
 {
 	return crc32c((u32)~1, name, len);
 }
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+				    int len)
+{
+	return (u64) crc32c(parent_objectid, name, len);
+}
+
 #endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index a13cf1a96c73..48b8fda93132 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,6 +18,7 @@
 
 #include "ctree.h"
 #include "disk-io.h"
+#include "hash.h"
 #include "transaction.h"
 #include "print-tree.h"
 
@@ -50,18 +51,57 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
 	return 0;
 }
 
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_path *path,
-			const char *name, int name_len,
-			u64 inode_objectid, u64 ref_objectid, int mod)
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+				   const char *name, int name_len,
+				   struct btrfs_inode_extref **extref_ret)
 {
+	struct extent_buffer *leaf;
+	struct btrfs_inode_extref *extref;
+	unsigned long ptr;
+	unsigned long name_ptr;
+	u32 item_size;
+	u32 cur_offset = 0;
+	int ref_name_len;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+	/*
+	 * Search all extended backrefs in this item. We're only
+	 * looking through any collisions so most of the time this is
+	 * just going to compare against one buffer. If all is well,
+	 * we'll return success and the inode ref object.
+	 */
+	while (cur_offset < item_size) {
+		extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+		name_ptr = (unsigned long)(&extref->name);
+		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+		if (ref_name_len == name_len &&
+		    btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+		    (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+			if (extref_ret)
+				*extref_ret = extref;
+			return 1;
+		}
+
+		cur_offset += ref_name_len + sizeof(*extref);
+	}
+	return 0;
+}
+
+static struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       const char *name, int name_len,
+		       u64 inode_objectid, u64 ref_objectid, int ins_len,
+		       int cow)
+{
+	int ret;
 	struct btrfs_key key;
 	struct btrfs_inode_ref *ref;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
-	int ret;
 
 	key.objectid = inode_objectid;
 	key.type = BTRFS_INODE_REF_KEY;
@@ -77,10 +117,147 @@ btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
 	return ref;
 }
 
-int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_inode_extref *extref;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+	if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+		return NULL;
+	return extref;
+}
+
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      const char *name, int name_len,
+			      u64 inode_objectid, u64 ref_objectid, int mod,
+			      u64 *ret_index)
+{
+	struct btrfs_inode_ref *ref;
+	struct btrfs_inode_extref *extref;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
+				     inode_objectid, ref_objectid, ins_len,
+				     cow);
+	if (IS_ERR(ref))
+		return PTR_ERR(ref);
+
+	if (ref != NULL) {
+		*ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
+		return 0;
+	}
+
+	btrfs_release_path(path);
+
+	extref = btrfs_lookup_inode_extref(trans, root, path, name,
+					   name_len, inode_objectid,
+					   ref_objectid, ins_len, cow);
+	if (IS_ERR(extref))
+		return PTR_ERR(extref);
+
+	if (extref) {
+		*ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	int ret;
+	int del_len = name_len + sizeof(*extref);
+	unsigned long ptr;
+	unsigned long item_start;
+	u32 item_size;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * Sanity check - did we find the right item for this name?
+	 * This should always succeed so error here will make the FS
+	 * readonly.
+	 */
+	if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+					    name, name_len, &extref)) {
+		btrfs_std_error(root->fs_info, -ENOENT);
+		ret = -EROFS;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (index)
+		*index = btrfs_inode_extref_index(leaf, extref);
+
+	if (del_len == item_size) {
+		/*
+		 * Common case only one ref in the item, remove the
+		 * whole item.
+		 */
+		ret = btrfs_del_item(trans, root, path);
+		goto out;
+	}
+
+	ptr = (unsigned long)extref;
+	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+	memmove_extent_buffer(leaf, ptr, ptr + del_len,
+			      item_size - (ptr + del_len - item_start));
+
+	btrfs_truncate_item(trans, root, path, item_size - del_len, 1);
+
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			const char *name, int name_len,
+			u64 inode_objectid, u64 ref_objectid, u64 *index)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -91,6 +268,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	u32 item_size;
 	u32 sub_item_len;
 	int ret;
+	int search_ext_refs = 0;
 	int del_len = name_len + sizeof(*ref);
 
 	key.objectid = inode_objectid;
@@ -106,12 +284,14 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		ret = -ENOENT;
+		search_ext_refs = 1;
 		goto out;
 	} else if (ret < 0) {
 		goto out;
 	}
 	if (!find_name_in_backref(path, name, name_len, &ref)) {
 		ret = -ENOENT;
+		search_ext_refs = 1;
 		goto out;
 	}
 	leaf = path->nodes[0];
@@ -129,8 +309,78 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			      item_size - (ptr + sub_item_len - item_start));
-	btrfs_truncate_item(trans, root, path,
-				  item_size - sub_item_len, 1);
+	btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1);
+out:
+	btrfs_free_path(path);
+
+	if (search_ext_refs) {
+		/*
+		 * No refs were found, or we could not find the
+		 * name in our ref array. Find and remove the extended
+		 * inode ref then.
+		 */
+		return btrfs_del_inode_extref(trans, root, name, name_len,
+					      inode_objectid, ref_objectid, index);
+	}
+
+	return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     const char *name, int name_len,
+				     u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+	struct btrfs_inode_extref *extref;
+	int ret;
+	int ins_len = name_len + sizeof(*extref);
+	unsigned long ptr;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      ins_len);
+	if (ret == -EEXIST) {
+		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+						   name, name_len, NULL))
+			goto out;
+
+		btrfs_extend_item(trans, root, path, ins_len);
+		ret = 0;
+	}
+	if (ret < 0)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_nr(leaf, path->slots[0]);
+	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+	ptr += btrfs_item_size(leaf, item) - ins_len;
+	extref = (struct btrfs_inode_extref *)ptr;
+
+	btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+	btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+	btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+	ptr = (unsigned long)&extref->name;
+	write_extent_buffer(path->nodes[0], name, ptr, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -191,6 +441,19 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 
 out:
 	btrfs_free_path(path);
+
+	if (ret == -EMLINK) {
+		struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+		/* We ran out of space in the ref array. Need to
+		 * add an extended ref. */
+		if (btrfs_super_incompat_flags(disk_super)
+		    & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+			ret = btrfs_insert_inode_extref(trans, root, name,
+							name_len,
+							inode_objectid,
+							ref_objectid, index);
+	}
+
 	return ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c50f7c4f5ac..596305e4d75b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2903,7 +2903,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_path *path;
-	struct btrfs_inode_ref *ref;
 	struct btrfs_dir_item *di;
 	struct inode *inode = dentry->d_inode;
 	u64 index;
@@ -3017,17 +3016,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	}
 	btrfs_release_path(path);
 
-	ref = btrfs_lookup_inode_ref(trans, root, path,
-				dentry->d_name.name, dentry->d_name.len,
-				ino, dir_ino, 0);
-	if (IS_ERR(ref)) {
-		err = PTR_ERR(ref);
+	ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
+					dentry->d_name.len, ino, dir_ino, 0,
+					&index);
+	if (ret) {
+		err = ret;
 		goto out;
 	}
-	BUG_ON(!ref); /* Logic error */
+
 	if (check_path_shared(root, path))
 		goto out;
-	index = btrfs_inode_ref_index(path->nodes[0], ref);
+
 	btrfs_release_path(path);
 
 	/*
@@ -4743,6 +4742,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
 	key[0].offset = 0;
 
+	/*
+	 * Start new inodes with an inode_ref. This is slightly more
+	 * efficient for small numbers of hard links since they will
+	 * be packed into one item. Extended refs will kick in if we
+	 * add more hard links than can fit in the ref item.
+	 */
 	key[1].objectid = objectid;
 	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
 	key[1].offset = ref_objectid;
@@ -5049,7 +5054,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EXDEV;
 
-	if (inode->i_nlink == ~0U)
+	if (inode->i_nlink >= BTRFS_LINK_MAX)
 		return -EMLINK;
 
 	err = btrfs_set_inode_index(dir, &index);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 15dae589e59f..1d7b34844323 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -24,8 +24,10 @@
 #include "disk-io.h"
 #include "locking.h"
 #include "print-tree.h"
+#include "backref.h"
 #include "compat.h"
 #include "tree-log.h"
+#include "hash.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -743,6 +745,7 @@ out:
  */
 static noinline int backref_in_log(struct btrfs_root *log,
 				   struct btrfs_key *key,
+				   u64 ref_objectid,
 				   char *name, int namelen)
 {
 	struct btrfs_path *path;
@@ -763,8 +766,17 @@ static noinline int backref_in_log(struct btrfs_root *log,
 	if (ret != 0)
 		goto out;
 
-	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+
+	if (key->type == BTRFS_INODE_EXTREF_KEY) {
+		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+						   name, namelen, NULL))
+			match = 1;
+
+		goto out;
+	}
+
+	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 	ptr_end = ptr + item_size;
 	while (ptr < ptr_end) {
 		ref = (struct btrfs_inode_ref *)ptr;
@@ -790,27 +802,36 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_path *path,
 				  struct btrfs_root *log_root,
 				  struct inode *dir, struct inode *inode,
-				  struct btrfs_key *key,
 				  struct extent_buffer *eb,
-				  struct btrfs_inode_ref *ref,
-				  char *name, int namelen, int *search_done)
+				  u64 inode_objectid, u64 parent_objectid,
+				  u64 ref_index, char *name, int namelen,
+				  int *search_done)
 {
 	int ret;
+	char *victim_name;
+	int victim_name_len;
+	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
+	struct btrfs_key search_key;
+	struct btrfs_inode_extref *extref;
 
-	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+again:
+	/* Search old style refs */
+	search_key.objectid = inode_objectid;
+	search_key.type = BTRFS_INODE_REF_KEY;
+	search_key.offset = parent_objectid;
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret == 0) {
-		char *victim_name;
-		int victim_name_len;
 		struct btrfs_inode_ref *victim_ref;
 		unsigned long ptr;
 		unsigned long ptr_end;
-		struct extent_buffer *leaf = path->nodes[0];
+
+		leaf = path->nodes[0];
 
 		/* are we trying to overwrite a back ref for the root directory
 		 * if so, just jump out, we're done
 		 */
-		if (key->objectid == key->offset)
+		if (search_key.objectid == search_key.offset)
 			return 1;
 
 		/* check all the names in this back reference to see
@@ -830,7 +851,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 					   (unsigned long)(victim_ref + 1),
 					   victim_name_len);
 
-			if (!backref_in_log(log_root, key, victim_name,
+			if (!backref_in_log(log_root, &search_key,
+					    parent_objectid,
+					    victim_name,
 					    victim_name_len)) {
 				btrfs_inc_nlink(inode);
 				btrfs_release_path(path);
@@ -838,9 +861,14 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 				ret = btrfs_unlink_inode(trans, root, dir,
 							 inode, victim_name,
 							 victim_name_len);
+				BUG_ON(ret);
 				btrfs_run_delayed_items(trans, root);
+				kfree(victim_name);
+				*search_done = 1;
+				goto again;
 			}
 			kfree(victim_name);
+
 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 		}
 		BUG_ON(ret);
@@ -853,10 +881,74 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
+	/* Same search but for extended refs */
+	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+					   inode_objectid, parent_objectid, 0,
+					   0);
+	if (!IS_ERR_OR_NULL(extref)) {
+		u32 item_size;
+		u32 cur_offset = 0;
+		unsigned long base;
+		struct inode *victim_parent;
+
+		leaf = path->nodes[0];
+
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *)base + cur_offset;
+
+			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+				goto next;
+
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
+					   victim_name_len);
+
+			search_key.objectid = inode_objectid;
+			search_key.type = BTRFS_INODE_EXTREF_KEY;
+			search_key.offset = btrfs_extref_hash(parent_objectid,
+							      victim_name,
+							      victim_name_len);
+			ret = 0;
+			if (!backref_in_log(log_root, &search_key,
+					    parent_objectid, victim_name,
+					    victim_name_len)) {
+				ret = -ENOENT;
+				victim_parent = read_one_inode(root,
+							       parent_objectid);
+				if (victim_parent) {
+					btrfs_inc_nlink(inode);
+					btrfs_release_path(path);
+
+					ret = btrfs_unlink_inode(trans, root,
+								 victim_parent,
+								 inode,
+								 victim_name,
+								 victim_name_len);
+					btrfs_run_delayed_items(trans, root);
+				}
+				BUG_ON(ret);
+				iput(victim_parent);
+				kfree(victim_name);
+				*search_done = 1;
+				goto again;
+			}
+			kfree(victim_name);
+			BUG_ON(ret);
+next:
+			cur_offset += victim_name_len + sizeof(*extref);
+		}
+		*search_done = 1;
+	}
+	btrfs_release_path(path);
+
 	/* look for a conflicting sequence number */
 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
-					 btrfs_inode_ref_index(eb, ref),
-					 name, namelen, 0);
+					 ref_index, name, namelen, 0);
 	if (di && !IS_ERR(di)) {
 		ret = drop_one_dir_item(trans, root, path, dir, di);
 		BUG_ON(ret);
@@ -875,6 +967,48 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			     u32 *namelen, char **name, u64 *index,
+			     u64 *parent_objectid)
+{
+	struct btrfs_inode_extref *extref;
+
+	extref = (struct btrfs_inode_extref *)ref_ptr;
+
+	*namelen = btrfs_inode_extref_name_len(eb, extref);
+	*name = kmalloc(*namelen, GFP_NOFS);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
+			   *namelen);
+
+	*index = btrfs_inode_extref_index(eb, extref);
+	if (parent_objectid)
+		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+	return 0;
+}
+
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			  u32 *namelen, char **name, u64 *index)
+{
+	struct btrfs_inode_ref *ref;
+
+	ref = (struct btrfs_inode_ref *)ref_ptr;
+
+	*namelen = btrfs_inode_ref_name_len(eb, ref);
+	*name = kmalloc(*namelen, GFP_NOFS);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+
+	*index = btrfs_inode_ref_index(eb, ref);
+
+	return 0;
+}
+
 /*
  * replay one inode back reference item found in the log tree.
  * eb, slot and key refer to the buffer and key found in the log tree.
@@ -888,7 +1022,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct extent_buffer *eb, int slot,
 				  struct btrfs_key *key)
 {
-	struct btrfs_inode_ref *ref;
 	struct inode *dir;
 	struct inode *inode;
 	unsigned long ref_ptr;
@@ -897,6 +1030,27 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	int namelen;
 	int ret;
 	int search_done = 0;
+	int log_ref_ver = 0;
+	u64 parent_objectid;
+	u64 inode_objectid;
+	u64 ref_index;
+	int ref_struct_size;
+
+	ref_ptr = btrfs_item_ptr_offset(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+	if (key->type == BTRFS_INODE_EXTREF_KEY) {
+		struct btrfs_inode_extref *r;
+
+		ref_struct_size = sizeof(struct btrfs_inode_extref);
+		log_ref_ver = 1;
+		r = (struct btrfs_inode_extref *)ref_ptr;
+		parent_objectid = btrfs_inode_extref_parent(eb, r);
+	} else {
+		ref_struct_size = sizeof(struct btrfs_inode_ref);
+		parent_objectid = key->offset;
+	}
+	inode_objectid = key->objectid;
 
 	/*
 	 * it is possible that we didn't log all the parent directories
@@ -904,32 +1058,38 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	 * copy the back ref in.  The link count fixup code will take
 	 * care of the rest
 	 */
-	dir = read_one_inode(root, key->offset);
+	dir = read_one_inode(root, parent_objectid);
 	if (!dir)
 		return -ENOENT;
 
-	inode = read_one_inode(root, key->objectid);
+	inode = read_one_inode(root, inode_objectid);
 	if (!inode) {
 		iput(dir);
 		return -EIO;
 	}
 
-	ref_ptr = btrfs_item_ptr_offset(eb, slot);
-	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
-
 	while (ref_ptr < ref_end) {
-		ref = (struct btrfs_inode_ref *)ref_ptr;
-
-		namelen = btrfs_inode_ref_name_len(eb, ref);
-		name = kmalloc(namelen, GFP_NOFS);
-		BUG_ON(!name);
-
-		read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+		if (log_ref_ver) {
+			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+						&ref_index, &parent_objectid);
+			/*
+			 * parent object can change from one array
+			 * item to another.
+			 */
+			if (!dir)
+				dir = read_one_inode(root, parent_objectid);
+			if (!dir)
+				return -ENOENT;
+		} else {
+			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+					     &ref_index);
+		}
+		if (ret)
+			return ret;
 
 		/* if we already have a perfect match, we're done */
 		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
-				  btrfs_inode_ref_index(eb, ref),
-				  name, namelen)) {
+				  ref_index, name, namelen)) {
 			/*
 			 * look for a conflicting back reference in the
 			 * metadata. if we find one we have to unlink that name
@@ -940,8 +1100,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
 			if (!search_done) {
 				ret = __add_inode_ref(trans, root, path, log,
-						      dir, inode, key, eb, ref,
-						      name, namelen,
+						      dir, inode, eb,
+						      inode_objectid,
+						      parent_objectid,
+						      ref_index, name, namelen,
 						      &search_done);
 				if (ret == 1)
 					goto out;
@@ -950,14 +1112,18 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
 			/* insert our name */
 			ret = btrfs_add_link(trans, dir, inode, name, namelen,
-					     0, btrfs_inode_ref_index(eb, ref));
+					     0, ref_index);
 			BUG_ON(ret);
 
 			btrfs_update_inode(trans, root, inode);
 		}
 
-		ref_ptr = (unsigned long)(ref + 1) + namelen;
+		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
 		kfree(name);
+		if (log_ref_ver) {
+			iput(dir);
+			dir = NULL;
+		}
 	}
 
 	/* finally write the back reference in the inode */
@@ -981,25 +1147,55 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-
-/*
- * There are a few corners where the link count of the file can't
- * be properly maintained during replay.  So, instead of adding
- * lots of complexity to the log code, we just scan the backrefs
- * for any file that has been through replay.
- *
- * The scan will update the link count on the inode to reflect the
- * number of back refs found.  If it goes down to zero, the iput
- * will free the inode.
- */
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root,
-					   struct inode *inode)
+static int count_inode_extrefs(struct btrfs_root *root,
+			       struct inode *inode, struct btrfs_path *path)
+{
+	int ret = 0;
+	int name_len;
+	unsigned int nlink = 0;
+	u32 item_size;
+	u32 cur_offset = 0;
+	u64 inode_objectid = btrfs_ino(inode);
+	u64 offset = 0;
+	unsigned long ptr;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+
+	while (1) {
+		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+					    &extref, &offset);
+		if (ret)
+			break;
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+			name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			nlink++;
+
+			cur_offset += name_len + sizeof(*extref);
+		}
+
+		offset++;
+		btrfs_release_path(path);
+	}
+	btrfs_release_path(path);
+
+	if (ret < 0)
+		return ret;
+	return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+			       struct inode *inode, struct btrfs_path *path)
 {
-	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
-	u64 nlink = 0;
+	unsigned int nlink = 0;
 	unsigned long ptr;
 	unsigned long ptr_end;
 	int name_len;
@@ -1009,10 +1205,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 	key.type = BTRFS_INODE_REF_KEY;
 	key.offset = (u64)-1;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -1046,6 +1238,50 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		btrfs_release_path(path);
 	}
 	btrfs_release_path(path);
+
+	return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct inode *inode)
+{
+	struct btrfs_path *path;
+	int ret;
+	u64 nlink = 0;
+	u64 ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = count_inode_refs(root, inode, path);
+	if (ret < 0)
+		goto out;
+
+	nlink = ret;
+
+	ret = count_inode_extrefs(root, inode, path);
+	if (ret == -ENOENT)
+		ret = 0;
+
+	if (ret < 0)
+		goto out;
+
+	nlink += ret;
+
+	ret = 0;
+
 	if (nlink != inode->i_nlink) {
 		set_nlink(inode, nlink);
 		btrfs_update_inode(trans, root, inode);
@@ -1061,9 +1297,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		ret = insert_orphan_item(trans, root, ino);
 		BUG_ON(ret);
 	}
-	btrfs_free_path(path);
 
-	return 0;
+out:
+	btrfs_free_path(path);
+	return ret;
 }
 
 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
@@ -1710,6 +1947,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			ret = add_inode_ref(wc->trans, root, log, path,
 					    eb, i, &key);
 			BUG_ON(ret && ret != -ENOENT);
+		} else if (key.type == BTRFS_INODE_EXTREF_KEY) {
+			ret = add_inode_ref(wc->trans, root, log, path,
+					    eb, i, &key);
+			BUG_ON(ret && ret != -ENOENT);
 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);

From d24bec3ae528a47149b838aad76c006d40fe8a39 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.de>
Date: Wed, 8 Aug 2012 11:33:54 -0700
Subject: [PATCH 095/121] btrfs: extended inode ref iteration

The iterate_irefs in backref.c is used to build path components from inode
refs. This patch adds code to iterate extended refs as well.

I had modify the callback function signature to abstract out some of the
differences between ref structures. iref_to_path() also needed similar
changes.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
---
 fs/btrfs/backref.c | 173 ++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/backref.h |   2 -
 2 files changed, 138 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index dc963121d1db..f3187938e081 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1177,26 +1177,12 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	return ret;
 }
 
-/*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
- * 0-terminated. the path is only given within the current file system.
- * Therefore, it never starts with a '/'. the caller is responsible to provide
- * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
- * the start point of the resulting string is returned. this pointer is within
- * dest, normally.
- * in case the path buffer would overflow, the pointer is decremented further
- * as if output was written to the buffer, though no more output is actually
- * generated. that way, the caller can determine how much space would be
- * required for the path to fit into the buffer. in that case, the returned
- * value will be smaller than dest. callers must check this!
- */
-char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-			 struct btrfs_inode_ref *iref,
+static char *ref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 u32 name_len, unsigned long name_off,
 			 struct extent_buffer *eb_in, u64 parent,
 			 char *dest, u32 size)
 {
-	u32 len;
 	int slot;
 	u64 next_inum;
 	int ret;
@@ -1204,17 +1190,17 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
 	int leave_spinning = path->leave_spinning;
+	struct btrfs_inode_ref *iref;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
 	path->leave_spinning = 1;
 	while (1) {
-		len = btrfs_inode_ref_name_len(eb, iref);
-		bytes_left -= len;
+		bytes_left -= name_len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
-						(unsigned long)(iref + 1), len);
+					   name_off, name_len);
 		if (eb != eb_in) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
@@ -1224,6 +1210,7 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			ret = -ENOENT;
 		if (ret)
 			break;
+
 		next_inum = found_key.offset;
 
 		/* regular exit ahead */
@@ -1239,8 +1226,11 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		}
 		btrfs_release_path(path);
-
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		name_len = btrfs_inode_ref_name_len(eb, iref);
+		name_off = (unsigned long)(iref + 1);
+
 		parent = next_inum;
 		--bytes_left;
 		if (bytes_left >= 0)
@@ -1256,6 +1246,32 @@ char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	return dest + bytes_left;
 }
 
+/*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+char *btrfs_iref_to_path(struct btrfs_root *fs_root,
+			 struct btrfs_path *path,
+			 struct btrfs_inode_ref *iref,
+			 struct extent_buffer *eb_in, u64 parent,
+			 char *dest, u32 size)
+{
+	return ref_to_path(fs_root, path,
+			   btrfs_inode_ref_name_len(eb_in, iref),
+			   (unsigned long)(iref + 1),
+			   eb_in, parent, dest, size);
+}
+
 /*
  * this makes the path point to (logical EXTENT_ITEM *)
  * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
@@ -1529,9 +1545,12 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
-				struct btrfs_path *path,
-				iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+			      struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+			      struct btrfs_path *path,
+			      iterate_irefs_t *iterate, void *ctx)
 {
 	int ret = 0;
 	int slot;
@@ -1548,7 +1567,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	while (!ret) {
 		path->leave_spinning = 1;
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-					&found_key);
+				     &found_key);
 		if (ret < 0)
 			break;
 		if (ret) {
@@ -1576,7 +1595,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 				 "tree %llu\n", cur,
 				 (unsigned long long)found_key.objectid,
 				 (unsigned long long)fs_root->objectid);
-			ret = iterate(parent, iref, eb, ctx);
+			ret = iterate(parent, name_len,
+				      (unsigned long)(iref + 1), eb, ctx);
 			if (ret)
 				break;
 			len = sizeof(*iref) + name_len;
@@ -1591,12 +1611,98 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	return ret;
 }
 
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+				 struct btrfs_path *path,
+				 iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u64 offset = 0;
+	u64 parent;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u32 item_size;
+	u32 cur_offset;
+	unsigned long ptr;
+
+	while (1) {
+		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+					    &offset);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		atomic_inc(&eb->refs);
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		cur_offset = 0;
+
+		while (cur_offset < item_size) {
+			u32 name_len;
+
+			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			ret = iterate(parent, name_len,
+				      (unsigned long)&extref->name, eb, ctx);
+			if (ret)
+				break;
+
+			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += sizeof(*extref);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+
+		offset++;
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+			 struct btrfs_path *path, iterate_irefs_t *iterate,
+			 void *ctx)
+{
+	int ret;
+	int found_refs = 0;
+
+	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+	if (!ret)
+		++found_refs;
+	else if (ret != -ENOENT)
+		return ret;
+
+	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+	if (ret == -ENOENT && found_refs)
+		return 0;
+
+	return ret;
+}
+
 /*
  * returns 0 if the path could be dumped (probably truncated)
  * returns <0 in case of an error
  */
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
-				struct extent_buffer *eb, void *ctx)
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+			 struct extent_buffer *eb, void *ctx)
 {
 	struct inode_fs_paths *ipath = ctx;
 	char *fspath;
@@ -1609,20 +1715,17 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 					ipath->fspath->bytes_left - s_ptr : 0;
 
 	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-	fspath = btrfs_iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
-				inum, fspath_min, bytes_left);
+	fspath = ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+			     name_off, eb, inum, fspath_min,
+			     bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
-		pr_debug("path resolved: %s\n", fspath);
 		ipath->fspath->val[i] = (u64)(unsigned long)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
-		pr_debug("missed path, not enough space. missing bytes: %lu, "
-			 "constructed so far: %s\n",
-			 (unsigned long)(fspath_min - fspath), fspath_min);
 		++ipath->fspath->elem_missed;
 		ipath->fspath->bytes_missing += fspath_min - fspath;
 		ipath->fspath->bytes_left = 0;
@@ -1644,7 +1747,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
 	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
-				inode_to_path, ipath);
+			     inode_to_path, ipath);
 }
 
 struct btrfs_data_container *init_data_container(u32 total_bytes)
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0b920c113952..e75533043a5f 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -33,8 +33,6 @@ struct inode_fs_paths {
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
 		void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
-				struct extent_buffer *eb, void *ctx);
 
 int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 			struct btrfs_path *path);

From e8830e606ffee383f073e32313f11fc5692813fe Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 19 Sep 2012 22:14:29 -0600
Subject: [PATCH 096/121] Btrfs: fix memory leak in start_transaction()

This patch fixes memory leak of the transaction handle which happened
when starting transaction failed on a freezed fs.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/transaction.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 910ff8051ba9..35489644c247 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -351,8 +351,10 @@ again:
 	 */
 	if (type != TRANS_JOIN_NOLOCK &&
 	    !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
-		if (type == TRANS_JOIN_FREEZE)
+		if (type == TRANS_JOIN_FREEZE) {
+			kmem_cache_free(btrfs_trans_handle_cachep, h);
 			return ERR_PTR(-EPERM);
+		}
 		sb_start_intwrite(root->fs_info->sb);
 	}
 

From a698d0755adb6f27289d1e6610b2240595d27e8c Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 20 Sep 2012 01:51:59 -0600
Subject: [PATCH 097/121] Btrfs: add a type field for the transaction handle

This patch add a type field into the transaction handle structure,
in this way, we needn't implement various end-transaction functions
and can make the code more simple and readable.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/inode.c       | 19 ++++---------------
 fs/btrfs/transaction.c | 29 ++++++-----------------------
 fs/btrfs/transaction.h | 15 +++++++++++----
 3 files changed, 21 insertions(+), 42 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 596305e4d75b..e355eb0aea1e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1363,11 +1363,7 @@ out_check:
 	}
 
 error:
-	if (nolock) {
-		err = btrfs_end_transaction_nolock(trans, root);
-	} else {
-		err = btrfs_end_transaction(trans, root);
-	}
+	err = btrfs_end_transaction(trans, root);
 	if (!ret)
 		ret = err;
 
@@ -1957,12 +1953,8 @@ out_unlock:
 out:
 	if (root != root->fs_info->tree_root)
 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-	if (trans) {
-		if (nolock)
-			btrfs_end_transaction_nolock(trans, root);
-		else
-			btrfs_end_transaction(trans, root);
-	}
+	if (trans)
+		btrfs_end_transaction(trans, root);
 
 	if (ret)
 		clear_extent_uptodate(io_tree, ordered_extent->file_offset,
@@ -4524,10 +4516,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 			trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
-		if (nolock)
-			ret = btrfs_end_transaction_nolock(trans, root);
-		else
-			ret = btrfs_commit_transaction(trans, root);
+		ret = btrfs_commit_transaction(trans, root);
 	}
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 35489644c247..d0a2b7e49381 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -267,14 +267,6 @@ static void wait_current_trans(struct btrfs_root *root)
 	}
 }
 
-enum btrfs_trans_type {
-	TRANS_START,
-	TRANS_JOIN,
-	TRANS_USERSPACE,
-	TRANS_JOIN_NOLOCK,
-	TRANS_JOIN_FREEZE,
-};
-
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
 	if (root->fs_info->log_root_recovering)
@@ -388,6 +380,7 @@ again:
 	h->aborted = 0;
 	h->qgroup_reserved = qgroup_reserved;
 	h->delayed_ref_elem.seq = 0;
+	h->type = type;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
 
@@ -540,11 +533,12 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, int throttle, int lock)
+			  struct btrfs_root *root, int throttle)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *info = root->fs_info;
 	int count = 0;
+	int lock = (trans->type != TRANS_JOIN_NOLOCK);
 	int err = 0;
 
 	if (--trans->use_count) {
@@ -645,7 +639,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	ret = __btrfs_end_transaction(trans, root, 0, 1);
+	ret = __btrfs_end_transaction(trans, root, 0);
 	if (ret)
 		return ret;
 	return 0;
@@ -656,18 +650,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	ret = __btrfs_end_transaction(trans, root, 1, 1);
-	if (ret)
-		return ret;
-	return 0;
-}
-
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root)
-{
-	int ret;
-
-	ret = __btrfs_end_transaction(trans, root, 0, 0);
+	ret = __btrfs_end_transaction(trans, root, 1);
 	if (ret)
 		return ret;
 	return 0;
@@ -676,7 +659,7 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root)
 {
-	return __btrfs_end_transaction(trans, root, 1, 1);
+	return __btrfs_end_transaction(trans, root, 1);
 }
 
 /*
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fbf8313b9d67..0630bd19396a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,14 @@ struct btrfs_transaction {
 	int aborted;
 };
 
+enum btrfs_trans_type {
+	TRANS_START,
+	TRANS_JOIN,
+	TRANS_USERSPACE,
+	TRANS_JOIN_NOLOCK,
+	TRANS_JOIN_FREEZE,
+};
+
 struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
@@ -58,8 +66,9 @@ struct btrfs_trans_handle {
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *orig_rsv;
-	int aborted;
-	int adding_csums;
+	short aborted;
+	short adding_csums;
+	enum btrfs_trans_type type;
 	/*
 	 * this root is only needed to validate that the root passed to
 	 * start_transaction is the same as the one passed to end_transaction.
@@ -94,8 +103,6 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items);
 struct btrfs_trans_handle *btrfs_start_transaction_noflush(

From 354aa0fb6d5b97b262e056f7ad7bfc88d7ce0004 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 20 Sep 2012 01:54:00 -0600
Subject: [PATCH 098/121] Btrfs: fix orphan transaction on the freezed
 filesystem

With the following debug patch:

 static int btrfs_freeze(struct super_block *sb)
 {
+ 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_transaction *trans;
+
+	spin_lock(&fs_info->trans_lock);
+	trans = fs_info->running_transaction;
+	if (trans) {
+		printk("Transid %llu, use_count %d, num_writer %d\n",
+			trans->transid, atomic_read(&trans->use_count),
+			atomic_read(&trans->num_writers));
+	}
+	spin_unlock(&fs_info->trans_lock);
 	return 0;
 }

I found there was a orphan transaction after the freeze operation was done.

It is because the transaction may not be committed when the transaction handle
end even though it is the last handle of the current transaction. This design
avoid committing the transaction frequently, but also introduce the above
problem.

So I add btrfs_attach_transaction() which can catch the current transaction
and commit it. If there is no transaction, it will return ENOENT, and do not
anything.

This function also can be used to instead of btrfs_join_transaction_freeze()
because it don't increase the writer counter and don't start a new transaction,
so it also can fix the deadlock between sync and freeze.

Besides that, it is used to instead of btrfs_join_transaction() in
transaction_kthread(), because if there is no transaction, the transaction
kthread needn't anything.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/disk-io.c     |  5 +++--
 fs/btrfs/super.c       | 18 +++++++++++++----
 fs/btrfs/transaction.c | 45 ++++++++++++++++++++++++++++--------------
 fs/btrfs/transaction.h |  4 ++--
 4 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2c18a4eb4d5a..dcaf55695e6f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1657,9 +1657,10 @@ static int transaction_kthread(void *arg)
 		spin_unlock(&root->fs_info->trans_lock);
 
 		/* If the file system is aborted, this will always fail. */
-		trans = btrfs_join_transaction(root);
+		trans = btrfs_attach_transaction(root);
 		if (IS_ERR(trans)) {
-			cannot_commit = true;
+			if (PTR_ERR(trans) != -ENOENT)
+				cannot_commit = true;
 			goto sleep;
 		}
 		if (transid == trans->transid) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index fb260bbf59c6..f8b803f30ed8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -854,10 +854,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	btrfs_wait_ordered_extents(root, 0);
 
-	trans = btrfs_join_transaction_freeze(root);
+	trans = btrfs_attach_transaction(root);
 	if (IS_ERR(trans)) {
-		/* Frozen, don't bother */
-		if (PTR_ERR(trans) == -EPERM)
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
 			return 0;
 		return PTR_ERR(trans);
 	}
@@ -1511,7 +1511,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-	return 0;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+	trans = btrfs_attach_transaction(root);
+	if (IS_ERR(trans)) {
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
+			return 0;
+		return PTR_ERR(trans);
+	}
+	return btrfs_commit_transaction(trans, root);
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d0a2b7e49381..69139a356f71 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -53,7 +53,7 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root, int nofail)
+static noinline int join_transaction(struct btrfs_root *root, int type)
 {
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -67,7 +67,13 @@ loop:
 	}
 
 	if (fs_info->trans_no_join) {
-		if (!nofail) {
+		/* 
+		 * If we are JOIN_NOLOCK we're already committing a current
+		 * transaction, we just need a handle to deal with something
+		 * when committing the transaction, such as inode cache and
+		 * space cache. It is a special case.
+		 */
+		if (type != TRANS_JOIN_NOLOCK) {
 			spin_unlock(&fs_info->trans_lock);
 			return -EBUSY;
 		}
@@ -87,6 +93,13 @@ loop:
 	}
 	spin_unlock(&fs_info->trans_lock);
 
+	/*
+	 * If we are ATTACH, we just want to catch the current transaction,
+	 * and commit it. If there is no transaction, just return ENOENT.
+	 */
+	if (type == TRANS_ATTACH)
+		return -ENOENT;
+
 	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
 	if (!cur_trans)
 		return -ENOMEM;
@@ -340,27 +353,28 @@ again:
 	 * because we're already holding a ref.  We need this because we could
 	 * have raced in and did an fsync() on a file which can kick a commit
 	 * and then we deadlock with somebody doing a freeze.
+	 *
+	 * If we are ATTACH, it means we just want to catch the current
+	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
 	 */
-	if (type != TRANS_JOIN_NOLOCK &&
-	    !__sb_start_write(root->fs_info->sb, SB_FREEZE_FS, false)) {
-		if (type == TRANS_JOIN_FREEZE) {
-			kmem_cache_free(btrfs_trans_handle_cachep, h);
-			return ERR_PTR(-EPERM);
-		}
+	if (type < TRANS_JOIN_NOLOCK)
 		sb_start_intwrite(root->fs_info->sb);
-	}
 
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
 
 	do {
-		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+		ret = join_transaction(root, type);
 		if (ret == -EBUSY)
 			wait_current_trans(root);
 	} while (ret == -EBUSY);
 
 	if (ret < 0) {
-		sb_end_intwrite(root->fs_info->sb);
+		/* We must get the transaction if we are JOIN_NOLOCK. */
+		BUG_ON(type == TRANS_JOIN_NOLOCK);
+
+		if (type < TRANS_JOIN_NOLOCK)
+			sb_end_intwrite(root->fs_info->sb);
 		kmem_cache_free(btrfs_trans_handle_cachep, h);
 		return ERR_PTR(ret);
 	}
@@ -432,9 +446,9 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
 	return start_transaction(root, 0, TRANS_USERSPACE, 0);
 }
 
-struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root)
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN_FREEZE, 0);
+	return start_transaction(root, 0, TRANS_ATTACH, 0);
 }
 
 /* wait for a transaction commit to be fully complete */
@@ -605,7 +619,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	if (lock)
+	if (trans->type < TRANS_JOIN_NOLOCK)
 		sb_end_intwrite(root->fs_info->sb);
 
 	WARN_ON(cur_trans != info->running_transaction);
@@ -1678,7 +1692,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
-	sb_end_intwrite(root->fs_info->sb);
+	if (trans->type < TRANS_JOIN_NOLOCK)
+		sb_end_intwrite(root->fs_info->sb);
 
 	trace_btrfs_transaction_commit(root);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0630bd19396a..80961947a6b2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -52,7 +52,7 @@ enum btrfs_trans_type {
 	TRANS_JOIN,
 	TRANS_USERSPACE,
 	TRANS_JOIN_NOLOCK,
-	TRANS_JOIN_FREEZE,
+	TRANS_ATTACH,
 };
 
 struct btrfs_trans_handle {
@@ -109,7 +109,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_noflush(
 					struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_join_transaction_freeze(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,

From 67b0fd63d5466bb710677afd3d2b77926c55f662 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 24 Sep 2012 13:42:00 -0400
Subject: [PATCH 099/121] Btrfs: run delayed refs first when out of space

Running delayed refs is faster than running delalloc, so lets do that first
to try and reclaim space.  This makes my fs_mark test about 20% faster.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent-tree.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 395e222e39ab..3270b1087850 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3817,10 +3817,10 @@ commit:
 }
 
 enum flush_state {
-	FLUSH_DELALLOC		=	1,
-	FLUSH_DELALLOC_WAIT	=	2,
-	FLUSH_DELAYED_ITEMS_NR	=	3,
-	FLUSH_DELAYED_ITEMS	=	4,
+	FLUSH_DELAYED_ITEMS_NR	=	1,
+	FLUSH_DELAYED_ITEMS	=	2,
+	FLUSH_DELALLOC		=	3,
+	FLUSH_DELALLOC_WAIT	=	4,
 	ALLOC_CHUNK		=	5,
 	COMMIT_TRANS		=	6,
 };
@@ -3834,11 +3834,6 @@ static int flush_space(struct btrfs_root *root,
 	int ret = 0;
 
 	switch (state) {
-	case FLUSH_DELALLOC:
-	case FLUSH_DELALLOC_WAIT:
-		shrink_delalloc(root, num_bytes, orig_bytes,
-				state == FLUSH_DELALLOC_WAIT);
-		break;
 	case FLUSH_DELAYED_ITEMS_NR:
 	case FLUSH_DELAYED_ITEMS:
 		if (state == FLUSH_DELAYED_ITEMS_NR) {
@@ -3859,6 +3854,11 @@ static int flush_space(struct btrfs_root *root,
 		ret = btrfs_run_delayed_items_nr(trans, root, nr);
 		btrfs_end_transaction(trans, root);
 		break;
+	case FLUSH_DELALLOC:
+	case FLUSH_DELALLOC_WAIT:
+		shrink_delalloc(root, num_bytes, orig_bytes,
+				state == FLUSH_DELALLOC_WAIT);
+		break;
 	case ALLOC_CHUNK:
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
@@ -3903,7 +3903,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 	struct btrfs_space_info *space_info = block_rsv->space_info;
 	u64 used;
 	u64 num_bytes = orig_bytes;
-	int flush_state = FLUSH_DELALLOC;
+	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 	int ret = 0;
 	bool flushing = false;
 	bool committed = false;

From 221b831835421f9451182611fa25fa60f440662f Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@zabbo.net>
Date: Thu, 20 Sep 2012 14:33:00 -0600
Subject: [PATCH 100/121] btrfs: fix min csum item size warnings in 32bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 7ca4be45a0255ac8f08c05491c6add2dd87dd4f8 limited csum items to
PAGE_CACHE_SIZE.  It used min() with incompatible types in 32bit which
generates warnings:

fs/btrfs/file-item.c: In function ‘btrfs_csum_file_blocks’:
fs/btrfs/file-item.c:717: warning: comparison of distinct pointer types lacks a cast

This uses min_t(u32,) to fix the warnings.  u32 seemed reasonable
because btrfs_root->leafsize is u32 and PAGE_CACHE_SIZE is unsigned
long.

Signed-off-by: Zach Brown <zab@zabbo.net>
---
 fs/btrfs/file-item.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54f03354bed1..1ad08e4e4a15 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -29,7 +29,8 @@
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
 
-#define MAX_CSUM_ITEMS(r, size) (min(__MAX_CSUM_ITEMS(r, size), PAGE_CACHE_SIZE))
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+				       PAGE_CACHE_SIZE))
 
 #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
 				   sizeof(struct btrfs_ordered_sum)) / \

From de0022b9da616b95ea5b41eab32da825b0b5150f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 25 Sep 2012 14:25:58 -0400
Subject: [PATCH 101/121] Btrfs: do not async metadata csumming in certain
 situations

There are a coule scenarios where farming metadata csumming off to an async
thread doesn't help.  The first is if our processor supports crc32c, in
which case the csumming will be fast and so the overhead of the async model
is not worth the cost.  The other case is for our tree log.  We will be
making that stuff dirty and writing it out and waiting for it immediately.
Even with software crc32c this gives me a ~15% increase in speed with O_SYNC
workloads.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/disk-io.c   | 22 ++++++++++++++++++++++
 fs/btrfs/extent_io.c | 14 ++++++++++++--
 fs/btrfs/extent_io.h |  1 +
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dcaf55695e6f..aa02eab8c40b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,10 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
+
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
@@ -859,10 +863,22 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+	if (bio_flags & EXTENT_BIO_TREE_LOG)
+		return 0;
+#ifdef CONFIG_X86
+	if (cpu_has_xmm4_2)
+		return 0;
+#endif
+	return 1;
+}
+
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	int async = check_async_write(inode, bio_flags);
 	int ret;
 
 	if (!(rw & REQ_WRITE)) {
@@ -877,6 +893,12 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			return ret;
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
+	} else if (!async) {
+		ret = btree_csum_one_bio(bio);
+		if (ret)
+			return ret;
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				     mirror_num, 0);
 	}
 
 	/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a2c21570adf5..979fa0d6bfee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -45,6 +45,7 @@ struct extent_page_data {
 	struct bio *bio;
 	struct extent_io_tree *tree;
 	get_extent_t *get_extent;
+	unsigned long bio_flags;
 
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
@@ -3163,12 +3164,16 @@ static int write_one_eb(struct extent_buffer *eb,
 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
 	u64 offset = eb->start;
 	unsigned long i, num_pages;
+	unsigned long bio_flags = 0;
 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
 	int ret = 0;
 
 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	atomic_set(&eb->io_pages, num_pages);
+	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+		bio_flags = EXTENT_BIO_TREE_LOG;
+
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
 
@@ -3177,7 +3182,8 @@ static int write_one_eb(struct extent_buffer *eb,
 		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
 					 -1, end_bio_extent_buffer_writepage,
-					 0, 0, 0);
+					 0, epd->bio_flags, bio_flags);
+		epd->bio_flags = bio_flags;
 		if (ret) {
 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 			SetPageError(p);
@@ -3212,6 +3218,7 @@ int btree_write_cache_pages(struct address_space *mapping,
 		.tree = tree,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	int ret = 0;
 	int done = 0;
@@ -3474,7 +3481,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd)
 		if (epd->sync_io)
 			rw = WRITE_SYNC;
 
-		ret = submit_one_bio(rw, epd->bio, 0, 0);
+		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
 		BUG_ON(ret < 0); /* -ENOMEM */
 		epd->bio = NULL;
 	}
@@ -3497,6 +3504,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = __extent_writepage(page, wbc, &epd);
@@ -3521,6 +3529,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.get_extent = get_extent,
 		.extent_locked = 1,
 		.sync_io = mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= mode,
@@ -3560,6 +3569,7 @@ int extent_writepages(struct extent_io_tree *tree,
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 512f8da041f1..a69dea219044 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,6 +27,7 @@
  * type for this bio
  */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
 #define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */

From ce1953325662fa597197ce728e4195582fc21c8d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 25 Sep 2012 15:26:16 -0400
Subject: [PATCH 102/121] Btrfs: do not hold the file extent leaf locked when
 adding extent item

For some reason we unlock everything except the leaf we are on, set the path
blocking and then add the extent item for the extent we just finished
writing.  I can't for the life of me figure out why we would want to do
this, and the history doesn't really indicate that there was a real reason
for it, so just remove it.  This will reduce our tree lock contention on
heavy writes.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e355eb0aea1e..d24b65014612 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1822,10 +1822,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
 
-	btrfs_unlock_up_safe(path, 1);
-	btrfs_set_lock_blocking(leaf);
-
 	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
 
 	inode_add_bytes(inode, num_bytes);
 

From e6138876ad8327250d77291b3262fee356267211 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 27 Sep 2012 17:07:30 -0400
Subject: [PATCH 103/121] Btrfs: cache extent state when writing out dirty
 metadata pages

Everytime we write out dirty pages we search for an offset in the tree,
convert the bits in the state, and then when we wait we search for the
offset again and clear the bits.  So for every dirty range in the io tree we
are doing 4 rb searches, which is suboptimal.  With this patch we are only
doing 2 searches for every cycle (modulo weird things happening).  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/disk-io.c          |  4 ++--
 fs/btrfs/extent-tree.c      |  5 +++--
 fs/btrfs/extent_io.c        | 43 +++++++++++++++++++++++++++++++++++--
 fs/btrfs/extent_io.h        |  6 ++++--
 fs/btrfs/free-space-cache.c |  2 +-
 fs/btrfs/relocation.c       |  2 +-
 fs/btrfs/transaction.c      | 14 +++++++-----
 fs/btrfs/tree-log.c         |  3 ++-
 8 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aa02eab8c40b..c69995556f61 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3572,7 +3572,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    mark);
+					    mark, NULL);
 		if (ret)
 			break;
 
@@ -3627,7 +3627,7 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
 again:
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3270b1087850..ca4aad96f814 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -312,7 +312,8 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 	while (start < end) {
 		ret = find_first_extent_bit(info->pinned_extents, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY | EXTENT_UPTODATE);
+					    EXTENT_DIRTY | EXTENT_UPTODATE,
+					    NULL);
 		if (ret)
 			break;
 
@@ -5045,7 +5046,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 979fa0d6bfee..e8ee39b73356 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -937,6 +937,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * @end:	the end offset in bytes (inclusive)
  * @bits:	the bits to set in this range
  * @clear_bits:	the bits to clear in this range
+ * @cached_state:	state that we're going to cache
  * @mask:	the allocation mask
  *
  * This will go through and set bits for the given range.  If any states exist
@@ -946,7 +947,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
  * boundary bits like LOCK.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits, gfp_t mask)
+		       int bits, int clear_bits,
+		       struct extent_state **cached_state, gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -963,6 +965,15 @@ again:
 	}
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->start <= start && state->end > start &&
+		    state->tree) {
+			node = &state->rb_node;
+			goto hit_next;
+		}
+	}
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -993,6 +1004,7 @@ hit_next:
 	 */
 	if (state->start == start && state->end <= end) {
 		set_state_bits(tree, state, &bits);
+		cache_state(state, cached_state);
 		state = clear_state_bit(tree, state, &clear_bits, 0);
 		if (last_end == (u64)-1)
 			goto out;
@@ -1033,6 +1045,7 @@ hit_next:
 			goto out;
 		if (state->end <= end) {
 			set_state_bits(tree, state, &bits);
+			cache_state(state, cached_state);
 			state = clear_state_bit(tree, state, &clear_bits, 0);
 			if (last_end == (u64)-1)
 				goto out;
@@ -1071,6 +1084,7 @@ hit_next:
 				   &bits);
 		if (err)
 			extent_io_tree_panic(tree, err);
+		cache_state(prealloc, cached_state);
 		prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
@@ -1093,6 +1107,7 @@ hit_next:
 			extent_io_tree_panic(tree, err);
 
 		set_state_bits(tree, prealloc, &bits);
+		cache_state(prealloc, cached_state);
 		clear_state_bit(tree, prealloc, &clear_bits, 0);
 		prealloc = NULL;
 		goto out;
@@ -1297,18 +1312,42 @@ out:
  * If nothing was found, 1 is returned. If found something, return 0.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
+			  u64 *start_ret, u64 *end_ret, int bits,
+			  struct extent_state **cached_state)
 {
 	struct extent_state *state;
+	struct rb_node *n;
 	int ret = 1;
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->end == start - 1 && state->tree) {
+			n = rb_next(&state->rb_node);
+			while (n) {
+				state = rb_entry(n, struct extent_state,
+						 rb_node);
+				if (state->state & bits)
+					goto got_it;
+				n = rb_next(n);
+			}
+			free_extent_state(*cached_state);
+			*cached_state = NULL;
+			goto out;
+		}
+		free_extent_state(*cached_state);
+		*cached_state = NULL;
+	}
+
 	state = find_first_extent_bit_state(tree, start, bits);
+got_it:
 	if (state) {
+		cache_state(state, cached_state);
 		*start_ret = state->start;
 		*end_ret = state->end;
 		ret = 0;
 	}
+out:
 	spin_unlock(&tree->lock);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a69dea219044..7aeb31087f88 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -233,13 +233,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits, gfp_t mask);
+		       int bits, int clear_bits,
+		       struct extent_state **cached_state, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
 		      struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits);
+			  u64 *start_ret, u64 *end_ret, int bits,
+			  struct extent_state **cached_state);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 						 u64 start, int bits);
 int extent_invalidatepage(struct extent_io_tree *tree,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b107e68797f4..1027b854b90c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -966,7 +966,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 			       block_group->key.offset)) {
 		ret = find_first_extent_bit(unpin, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret) {
 			ret = 0;
 			break;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 6e530bb86c94..776f0aa128fc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3621,7 +3621,7 @@ next:
 
 		ret = find_first_extent_bit(&rc->processed_blocks,
 					    key.objectid, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 
 		if (ret == 0 && start <= key.objectid) {
 			btrfs_release_path(path);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 69139a356f71..77db875b5116 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -687,13 +687,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 	int err = 0;
 	int werr = 0;
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      mark)) {
-		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
-				   GFP_NOFS);
+				      mark, &cached_state)) {
+		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+				   mark, &cached_state, GFP_NOFS);
+		cached_state = NULL;
 		err = filemap_fdatawrite_range(mapping, start, end);
 		if (err)
 			werr = err;
@@ -717,12 +719,14 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	int err = 0;
 	int werr = 0;
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      EXTENT_NEED_WAIT)) {
-		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+				      EXTENT_NEED_WAIT, &cached_state)) {
+		clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+				 0, 0, &cached_state, GFP_NOFS);
 		err = filemap_fdatawait_range(mapping, start, end);
 		if (err)
 			werr = err;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1d7b34844323..f4b9e54b1da2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2463,7 +2463,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+				NULL);
 		if (ret)
 			break;
 

From 6f1fed775316d58ba356b2ce62de600ad00f003a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 26 Sep 2012 11:07:06 -0400
Subject: [PATCH 104/121] Btrfs: don't lookup csums for prealloc extents

The tree logging stuff was looking up csums to copy over for prealloc
extents which is just work we don't need to be doing.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/tree-log.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f4b9e54b1da2..31b46a9e94cc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3011,8 +3011,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				continue;
 
 			found_type = btrfs_file_extent_type(src, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG ||
-			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
 				u64 ds, dl, cs, cl;
 				ds = btrfs_file_extent_disk_bytenr(src,
 								extent);

From 18ec90d63f43cf785f6e73e7a7db546bff3fa380 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 28 Sep 2012 11:56:28 -0400
Subject: [PATCH 105/121] Btrfs: be smarter about dropping things from the tree
 log

When we truncate existing items in the tree log we've been searching for
each individual item and removing them.  This is unnecessary churn and
searching, just keep track of the slot we are on and how many items we need
to delete and delete them all at once.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/tree-log.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 31b46a9e94cc..179fda964601 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2901,6 +2901,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
+	int start_slot;
 
 	key.objectid = objectid;
 	key.type = max_key_type;
@@ -2922,8 +2923,18 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 		if (found_key.objectid != objectid)
 			break;
 
-		ret = btrfs_del_item(trans, log, path);
-		if (ret)
+		found_key.offset = 0;
+		found_key.type = 0;
+		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
+				       &start_slot);
+
+		ret = btrfs_del_items(trans, log, path, start_slot,
+				      path->slots[0] - start_slot + 1);
+		/*
+		 * If start slot isn't 0 then we don't need to re-search, we've
+		 * found the last guy with the objectid in this tree.
+		 */
+		if (ret || start_slot != 0)
 			break;
 		btrfs_release_path(path);
 	}

From f0bd95ea72644cbf9210606f6f8edfce13d65d47 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Mon, 1 Oct 2012 03:08:37 -0600
Subject: [PATCH 106/121] Btrfs: confirmation of value is added before
 trace_btrfs_get_extent() is called

We should confirm the value of extent_map before calling
trace_btrfs_get_extent() because the value of extent_map has the
possibility of NULL.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d24b65014612..f92def2467a6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5505,7 +5505,8 @@ insert:
 	write_unlock(&em_tree->lock);
 out:
 
-	trace_btrfs_get_extent(root, em);
+	if (em)
+		trace_btrfs_get_extent(root, em);
 
 	if (path)
 		btrfs_free_path(path);

From 44734ed1ca0c2e66479cedbf773627a9b5a31364 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 28 Sep 2012 16:04:19 -0400
Subject: [PATCH 107/121] Btrfs: don't commit instead of overcommitting

I don't think we have the same problem that this was supposed to fix
originally since we can allocate chunks in the enospc path now.  This code
is causing us to constantly commit the transaction as we get close to using
all of our available space in our currently allocated chunks, instead of
allocating another chunk and carrying on with life, which is not nice for
performance.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent-tree.c | 34 ++++++----------------------------
 1 file changed, 6 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ca4aad96f814..3d3e2c17d8d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3907,7 +3907,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 	int ret = 0;
 	bool flushing = false;
-	bool committed = false;
 
 again:
 	ret = 0;
@@ -3970,33 +3969,12 @@ again:
 			(orig_bytes * 2);
 	}
 
-	if (ret) {
-		u64 avail;
-
-		/*
-		 * If we have a lot of space that's pinned, don't bother doing
-		 * the overcommit dance yet and just commit the transaction.
-		 */
-		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
-		do_div(avail, 10);
-		if (space_info->bytes_pinned >= avail && flush && !committed) {
-			space_info->flush = 1;
-			flushing = true;
-			spin_unlock(&space_info->lock);
-			ret = may_commit_transaction(root, space_info,
-						     orig_bytes, 1);
-			if (ret)
-				goto out;
-			committed = true;
-			goto again;
-		}
-
-		if (can_overcommit(root, space_info, orig_bytes, flush)) {
-			space_info->bytes_may_use += orig_bytes;
-			trace_btrfs_space_reservation(root->fs_info,
-				"space_info", space_info->flags, orig_bytes, 1);
-			ret = 0;
-		}
+	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+		space_info->bytes_may_use += orig_bytes;
+		trace_btrfs_space_reservation(root->fs_info, "space_info",
+					      space_info->flags, orig_bytes,
+					      1);
+		ret = 0;
 	}
 
 	/*

From 8d1a1317af439cc7f349907b79198d35feffe179 Mon Sep 17 00:00:00 2001
From: Robin Dong <sanbai@taobao.com>
Date: Sat, 29 Sep 2012 02:07:46 -0600
Subject: [PATCH 108/121] btrfs: remove unused function
 btrfs_insert_some_items()

The function btrfs_insert_some_items() would not be called by any other functions,
so remove it.

Signed-off-by: Robin Dong <sanbai@taobao.com>
---
 fs/btrfs/ctree.c | 143 -----------------------------------------------
 1 file changed, 143 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fcc8c21a6233..b33436211000 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4401,149 +4401,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
 	}
 }
 
-/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
- * Returns the number of keys that were inserted.
- */
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path,
-			    struct btrfs_key *cpu_key, u32 *data_size,
-			    int nr)
-{
-	struct extent_buffer *leaf;
-	struct btrfs_item *item;
-	int ret = 0;
-	int slot;
-	int i;
-	u32 nritems;
-	u32 total_data = 0;
-	u32 total_size = 0;
-	unsigned int data_end;
-	struct btrfs_disk_key disk_key;
-	struct btrfs_key found_key;
-	struct btrfs_map_token token;
-
-	btrfs_init_map_token(&token);
-
-	for (i = 0; i < nr; i++) {
-		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
-		    BTRFS_LEAF_DATA_SIZE(root)) {
-			break;
-			nr = i;
-		}
-		total_data += data_size[i];
-		total_size += data_size[i] + sizeof(struct btrfs_item);
-	}
-	BUG_ON(nr == 0);
-
-	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-	if (ret == 0)
-		return -EEXIST;
-	if (ret < 0)
-		goto out;
-
-	leaf = path->nodes[0];
-
-	nritems = btrfs_header_nritems(leaf);
-	data_end = leaf_data_end(root, leaf);
-
-	if (btrfs_leaf_free_space(root, leaf) < total_size) {
-		for (i = nr; i >= 0; i--) {
-			total_data -= data_size[i];
-			total_size -= data_size[i] + sizeof(struct btrfs_item);
-			if (total_size < btrfs_leaf_free_space(root, leaf))
-				break;
-		}
-		nr = i;
-	}
-
-	slot = path->slots[0];
-	BUG_ON(slot < 0);
-
-	if (slot != nritems) {
-		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
-
-		item = btrfs_item_nr(leaf, slot);
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
-		/* figure out how many keys we can insert in here */
-		total_data = data_size[0];
-		for (i = 1; i < nr; i++) {
-			if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
-				break;
-			total_data += data_size[i];
-		}
-		nr = i;
-
-		if (old_data < data_end) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
-			       slot, old_data, data_end);
-			BUG_ON(1);
-		}
-		/*
-		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
-		 */
-		/* first correct the data pointers */
-		for (i = slot; i < nritems; i++) {
-			u32 ioff;
-
-			item = btrfs_item_nr(leaf, i);
-			ioff = btrfs_token_item_offset(leaf, item, &token);
-			btrfs_set_token_item_offset(leaf, item,
-						    ioff - total_data, &token);
-		}
-		/* shift the items */
-		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
-			      btrfs_item_nr_offset(slot),
-			      (nritems - slot) * sizeof(struct btrfs_item));
-
-		/* shift the data */
-		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-			      data_end - total_data, btrfs_leaf_data(leaf) +
-			      data_end, old_data - data_end);
-		data_end = old_data;
-	} else {
-		/*
-		 * this sucks but it has to be done, if we are inserting at
-		 * the end of the leaf only insert 1 of the items, since we
-		 * have no way of knowing whats on the next leaf and we'd have
-		 * to drop our current locks to figure it out
-		 */
-		nr = 1;
-	}
-
-	/* setup the item for the new data */
-	for (i = 0; i < nr; i++) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
-		btrfs_set_item_key(leaf, &disk_key, slot + i);
-		item = btrfs_item_nr(leaf, slot + i);
-		btrfs_set_token_item_offset(leaf, item,
-					    data_end - data_size[i], &token);
-		data_end -= data_size[i];
-		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
-	}
-	btrfs_set_header_nritems(leaf, nritems + nr);
-	btrfs_mark_buffer_dirty(leaf);
-
-	ret = 0;
-	if (slot == 0) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		fixup_low_keys(trans, root, path, &disk_key, 1);
-	}
-
-	if (btrfs_leaf_free_space(root, leaf) < 0) {
-		btrfs_print_leaf(root, leaf);
-		BUG();
-	}
-out:
-	if (!ret)
-		ret = nr;
-	return ret;
-}
-
 /*
  * this is a helper for btrfs_insert_empty_items, the main goal here is
  * to save stack depth by doing the bulk of the work in a function

From 7a2d6a64645b38d7040bbd031c7a7b2655f5d976 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Mon, 1 Oct 2012 03:07:15 -0600
Subject: [PATCH 109/121] Btrfs: remove unnecessary IS_ERR in
 bio_readpage_error()

Because the value of extent_map is only a correct value or NULL,
so IS_ERR is unnecessary.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e8ee39b73356..67fe401c3209 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2110,7 +2110,7 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 		}
 		read_unlock(&em_tree->lock);
 
-		if (!em || IS_ERR(em)) {
+		if (!em) {
 			kfree(failrec);
 			return -EIO;
 		}

From 479ed9abdbeec5d9ed0005f3bee9c9bc06a102bb Mon Sep 17 00:00:00 2001
From: Robin Dong <sanbai@taobao.com>
Date: Sat, 29 Sep 2012 02:07:47 -0600
Subject: [PATCH 110/121] btrfs: move inline function code to header file

When building btrfs from kernel code, it will report:

	fs/btrfs/extent_io.h:281: warning: 'extent_buffer_page' declared inline after being called
	fs/btrfs/extent_io.h:281: warning: previous declaration of 'extent_buffer_page' was here
	fs/btrfs/extent_io.h:280: warning: 'num_extent_pages' declared inline after being called
	fs/btrfs/extent_io.h:280: warning: previous declaration of 'num_extent_pages' was here

because of the wrong declaration of inline functions.

Signed-off-by: Robin Dong <sanbai@taobao.com>
---
 fs/btrfs/extent_io.c | 12 ------------
 fs/btrfs/extent_io.h | 14 ++++++++++++--
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 67fe401c3209..b82d244a2ef5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3986,18 +3986,6 @@ out:
 	return ret;
 }
 
-inline struct page *extent_buffer_page(struct extent_buffer *eb,
-					      unsigned long i)
-{
-	return eb->pages[i];
-}
-
-inline unsigned long num_extent_pages(u64 start, u64 len)
-{
-	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-		(start >> PAGE_CACHE_SHIFT);
-}
-
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
 #if LEAK_DEBUG
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7aeb31087f88..711d12b80028 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -282,8 +282,18 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
-unsigned long num_extent_pages(u64 start, u64 len);
-struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+		(start >> PAGE_CACHE_SHIFT);
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+					      unsigned long i)
+{
+	return eb->pages[i];
+}
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {

From 94edf4ae43a5f9405fe3570f670e26ce1b188476 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 25 Sep 2012 14:56:25 -0400
Subject: [PATCH 111/121] Btrfs: don't bother committing delayed inode updates
 when fsyncing

We can just copy the in memory inode into the tree log directly, no sense in
updating the fs tree so we can copy it into the tree log tree.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/tree-log.c | 84 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 179fda964601..47911fd18310 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2944,6 +2944,55 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *leaf,
+			    struct btrfs_inode_item *item,
+			    struct inode *inode, int log_inode_only)
+{
+	btrfs_set_inode_uid(leaf, item, inode->i_uid);
+	btrfs_set_inode_gid(leaf, item, inode->i_gid);
+	btrfs_set_inode_mode(leaf, item, inode->i_mode);
+	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
+			       inode->i_atime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
+				inode->i_atime.tv_nsec);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
+			       inode->i_mtime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
+				inode->i_mtime.tv_nsec);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
+			       inode->i_ctime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
+				inode->i_ctime.tv_nsec);
+
+	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+
+	btrfs_set_inode_sequence(leaf, item, inode->i_version);
+	btrfs_set_inode_transid(leaf, item, trans->transid);
+	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
+	btrfs_set_inode_block_group(leaf, item, 0);
+
+	if (log_inode_only) {
+		/* set the generation to zero so the recover code
+		 * can tell the difference between an logging
+		 * just to say 'this inode exists' and a logging
+		 * to say 'update this inode with these values'
+		 */
+		btrfs_set_inode_generation(leaf, item, 0);
+		btrfs_set_inode_size(leaf, item, 0);
+	} else {
+		btrfs_set_inode_generation(leaf, item,
+					   BTRFS_I(inode)->generation);
+		btrfs_set_inode_size(leaf, item, inode->i_size);
+	}
+
+}
+
 static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode,
 			       struct btrfs_path *dst_path,
@@ -2990,24 +3039,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 
 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
 
-		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
-				   src_offset, ins_sizes[i]);
-
-		if (inode_only == LOG_INODE_EXISTS &&
-		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
 						    dst_path->slots[0],
 						    struct btrfs_inode_item);
-			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
-
-			/* set the generation to zero so the recover code
-			 * can tell the difference between an logging
-			 * just to say 'this inode exists' and a logging
-			 * to say 'update this inode with these values'
-			 */
-			btrfs_set_inode_generation(dst_path->nodes[0],
-						   inode_item, 0);
+			fill_inode_item(trans, dst_path->nodes[0], inode_item,
+					inode, inode_only == LOG_INODE_EXISTS);
+		} else {
+			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+					   src_offset, ins_sizes[i]);
 		}
+
 		/* take a reference on file data extents so that truncates
 		 * or deletes of this inode don't have to relog the inode
 		 * again
@@ -3361,11 +3403,15 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		max_key.type = (u8)-1;
 	max_key.offset = (u64)-1;
 
-	ret = btrfs_commit_inode_delayed_items(trans, inode);
-	if (ret) {
-		btrfs_free_path(path);
-		btrfs_free_path(dst_path);
-		return ret;
+	/* Only run delayed items if we are a dir or a new file */
+	if (S_ISDIR(inode->i_mode) ||
+	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+		ret = btrfs_commit_inode_delayed_items(trans, inode);
+		if (ret) {
+			btrfs_free_path(path);
+			btrfs_free_path(dst_path);
+			return ret;
+		}
 	}
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);

From 489406626c42c0176fddae2182d33be2cfb9840c Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@quora.org>
Date: Mon, 7 May 2012 06:35:38 -0600
Subject: [PATCH 112/121] btrfs: fix message printing

Fix various messages to include newline and module prefix.

Signed-off-by: Daniel J Blueman <daniel@quora.org>
---
 fs/btrfs/super.c   | 8 ++++----
 fs/btrfs/volumes.c | 6 +++---
 fs/btrfs/zlib.c    | 8 ++++----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f8b803f30ed8..0fadcdba8ce4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -243,7 +243,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, const char *function,
 			       unsigned int line, int errno)
 {
-	WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
+	WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n");
 	trans->aborted = errno;
 	/* Nothing used. The other threads that have joined this
 	 * transaction may be able to continue. */
@@ -549,11 +549,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
 			break;
 		case Opt_defrag:
-			printk(KERN_INFO "btrfs: enabling auto defrag");
+			printk(KERN_INFO "btrfs: enabling auto defrag\n");
 			btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
 			break;
 		case Opt_recovery:
-			printk(KERN_INFO "btrfs: enabling auto recovery");
+			printk(KERN_INFO "btrfs: enabling auto recovery\n");
 			btrfs_set_opt(info->mount_opt, RECOVERY);
 			break;
 		case Opt_skip_balance:
@@ -1602,7 +1602,7 @@ static int btrfs_interface_init(void)
 static void btrfs_interface_exit(void)
 {
 	if (misc_deregister(&btrfs_misc) < 0)
-		printk(KERN_INFO "misc_deregister failed for control device");
+		printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4f4de51b5bdd..dfe5e3a22f55 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -639,7 +639,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		bdev = blkdev_get_by_path(device->name->str, flags, holder);
 		if (IS_ERR(bdev)) {
-			printk(KERN_INFO "open %s failed\n", device->name->str);
+			printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
 			goto error;
 		}
 		filemap_write_and_wait(bdev->bd_inode->i_mapping);
@@ -3769,7 +3769,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	read_unlock(&em_tree->lock);
 
 	if (!em) {
-		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+		printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n",
 		       (unsigned long long)logical,
 		       (unsigned long long)*length);
 		BUG();
@@ -4226,7 +4226,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 	total_devs = bbio->num_stripes;
 	if (map_length < length) {
-		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
 		       (unsigned long long)length,
 		       (unsigned long long)map_length);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 92c20654cc55..9acb846c3e7f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
 	*total_in = 0;
 
 	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
-		printk(KERN_WARNING "deflateInit failed\n");
+		printk(KERN_WARNING "btrfs: deflateInit failed\n");
 		ret = -1;
 		goto out;
 	}
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
 	while (workspace->def_strm.total_in < len) {
 		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
 		if (ret != Z_OK) {
-			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+			printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
 			       ret);
 			zlib_deflateEnd(&workspace->def_strm);
 			ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 	}
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
-		printk(KERN_WARNING "inflateInit failed\n");
+		printk(KERN_WARNING "btrfs: inflateInit failed\n");
 		return -1;
 	}
 	while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
 	}
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
-		printk(KERN_WARNING "inflateInit failed\n");
+		printk(KERN_WARNING "btrfs: inflateInit failed\n");
 		return -1;
 	}
 

From bedb2cca7252d08c6ca3085826e30f65bdc3b54b Mon Sep 17 00:00:00 2001
From: Andrei Popa <andrei.popa@i-neo.ro>
Date: Thu, 20 Sep 2012 08:42:11 -0600
Subject: [PATCH 113/121] Btrfs: make compress and nodatacow mount options
 mutually exclusive

If a filesystem is mounted with compression and then remounted by adding nodatacow,
the compression is disabled but the compress flag is still visible.
Also, if a filesystem is mounted with nodatacow and then remounted with compression,
nodatacow flag is still present but it's not active.
This patch:
- removes compress flags and notifies that the compression has been disabled if the
  filesystem is mounted with nodatacow
- removes nodatacow and nodatasum flags if mounted with compress.

Signed-off-by: Andrei Popa <andrei.popa@i-neo.ro>
---
 fs/btrfs/super.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0fadcdba8ce4..915ac14c2064 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -413,7 +413,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		case Opt_nodatacow:
-			printk(KERN_INFO "btrfs: setting nodatacow\n");
+			if (!btrfs_test_opt(root, COMPRESS) ||
+				!btrfs_test_opt(root, FORCE_COMPRESS)) {
+					printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
+			} else {
+				printk(KERN_INFO "btrfs: setting nodatacow\n");
+			}
+			info->compress_type = BTRFS_COMPRESS_NONE;
+			btrfs_clear_opt(info->mount_opt, COMPRESS);
+			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
 			btrfs_set_opt(info->mount_opt, NODATACOW);
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
@@ -428,10 +436,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				compress_type = "zlib";
 				info->compress_type = BTRFS_COMPRESS_ZLIB;
 				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
 			} else if (strcmp(args[0].from, "lzo") == 0) {
 				compress_type = "lzo";
 				info->compress_type = BTRFS_COMPRESS_LZO;
 				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
 				btrfs_set_fs_incompat(info, COMPRESS_LZO);
 			} else if (strncmp(args[0].from, "no", 2) == 0) {
 				compress_type = "no";

From 62856a9b73860cffe2a3d91b069393b88c219aa6 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Tue, 31 Jul 2012 11:09:44 -0600
Subject: [PATCH 114/121] Btrfs: detect corrupted filesystem after write I/O
 errors

In check-integrity, detect when a superblock is written that points
to blocks that have not been written to disk due to I/O write errors.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/check-integrity.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9197e2e33407..5a3e45db642a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -37,8 +37,9 @@
  *        the file system was mounted, (i.e., they have been
  *        referenced by the super block) or they have been
  *        written since then and the write completion callback
- *        was called and a FLUSH request to the device where
- *        these blocks are located was received and completed.
+ *        was called and no write error was indicated and a
+ *        FLUSH request to the device where these blocks are
+ *        located was received and completed.
  *    2b. All referenced blocks need to have a generation
  *        number which is equal to the parent's number.
  *
@@ -2601,6 +2602,17 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 			       (unsigned long long)l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num);
 			ret = -1;
+		} else if (l->block_ref_to->iodone_w_error) {
+			printk(KERN_INFO "btrfs: attempt to write superblock"
+			       " which references block %c @%llu (%s/%llu/%d)"
+			       " which has write error!\n",
+			       btrfsic_get_block_type(state, l->block_ref_to),
+			       (unsigned long long)
+			       l->block_ref_to->logical_bytenr,
+			       l->block_ref_to->dev_state->name,
+			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num);
+			ret = -1;
 		} else if (l->parent_generation !=
 			   l->block_ref_to->generation &&
 			   BTRFSIC_GENERATION_UNKNOWN !=

From 5af3e8cce8b7ba0a2819e18c9146c8c0b452d479 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Wed, 1 Aug 2012 18:56:49 +0200
Subject: [PATCH 115/121] Btrfs: make filesystem read-only when submitting
 barrier fails

So far the return code of barrier_all_devices() is ignored, which
means that errors are ignored. The result can be a corrupt
filesystem which is not consistent.
This commit adds code to evaluate the return code of
barrier_all_devices(). The normal btrfs_error() mechanism is used to
switch the filesystem into read-only mode when errors are detected.

In order to decide whether barrier_all_devices() should return
error or success, the number of disks that are allowed to fail the
barrier submission is calculated. This calculation accounts for the
worst RAID level of metadata, system and data. If single, dup or
RAID0 is in use, a single disk error is already considered to be
fatal. Otherwise a single disk error is tolerated.

The calculation of the number of disks that are tolerated to fail
the barrier operation is performed when the filesystem gets mounted,
when a balance operation is started and finished, and when devices
are added or removed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/ctree.h    |   5 ++
 fs/btrfs/disk-io.c  | 109 ++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/disk-io.h  |   2 +
 fs/btrfs/ioctl.c    |   8 ++--
 fs/btrfs/tree-log.c |   7 ++-
 fs/btrfs/volumes.c  |  30 ++++++++++++
 6 files changed, 142 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50dcd0fbae11..1630be831210 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1468,6 +1468,8 @@ struct btrfs_fs_info {
 
 	/* next backup root to be overwritten */
 	int backup_root_index;
+
+	int num_tolerated_disk_barrier_failures;
 };
 
 /*
@@ -3361,6 +3363,9 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
 		      struct btrfs_ioctl_defrag_range_args *range,
 		      u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space);
+
 /* file.c */
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c69995556f61..835523687707 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2505,6 +2505,8 @@ retry_root_backup:
 		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
 		goto fail_block_groups;
 	}
+	fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
@@ -2888,12 +2890,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 			printk_in_rcu("btrfs: disabling barriers on dev %s\n",
 				      rcu_str_deref(device->name));
 			device->nobarriers = 1;
-		}
-		if (!bio_flagged(bio, BIO_UPTODATE)) {
+		} else if (!bio_flagged(bio, BIO_UPTODATE)) {
 			ret = -EIO;
-			if (!bio_flagged(bio, BIO_EOPNOTSUPP))
-				btrfs_dev_stat_inc_and_print(device,
-					BTRFS_DEV_STAT_FLUSH_ERRS);
+			btrfs_dev_stat_inc_and_print(device,
+				BTRFS_DEV_STAT_FLUSH_ERRS);
 		}
 
 		/* drop the reference from the wait == 0 run */
@@ -2932,14 +2932,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
-	int errors = 0;
+	int errors_send = 0;
+	int errors_wait = 0;
 	int ret;
 
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_send++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2948,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 		ret = write_dev_flush(dev, 0);
 		if (ret)
-			errors++;
+			errors_send++;
 	}
 
 	/* wait for all the barriers */
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_wait++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2961,13 +2962,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 		ret = write_dev_flush(dev, 1);
 		if (ret)
-			errors++;
+			errors_wait++;
 	}
-	if (errors)
+	if (errors_send > info->num_tolerated_disk_barrier_failures ||
+	    errors_wait > info->num_tolerated_disk_barrier_failures)
 		return -EIO;
 	return 0;
 }
 
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_space_info *sinfo;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
+	int i;
+	int c;
+	int num_tolerated_disk_barrier_failures =
+		(int)fs_info->fs_devices->num_devices;
+
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		sinfo = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+			if (tmp->flags == types[i]) {
+				sinfo = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!sinfo)
+			continue;
+
+		down_read(&sinfo->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&sinfo->block_groups[c])) {
+				u64 flags;
+
+				btrfs_get_block_group_info(
+					&sinfo->block_groups[c], &space);
+				if (space.total_bytes == 0 ||
+				    space.used_bytes == 0)
+					continue;
+				flags = space.flags;
+				/*
+				 * return
+				 * 0: if dup, single or RAID0 is configured for
+				 *    any of metadata, system or data, else
+				 * 1: if RAID5 is configured, or if RAID1 or
+				 *    RAID10 is configured and only two mirrors
+				 *    are used, else
+				 * 2: if RAID6 is configured, else
+				 * num_mirrors - 1: if RAID1 or RAID10 is
+				 *                  configured and more than
+				 *                  2 mirrors are used.
+				 */
+				if (num_tolerated_disk_barrier_failures > 0 &&
+				    ((flags & (BTRFS_BLOCK_GROUP_DUP |
+					       BTRFS_BLOCK_GROUP_RAID0)) ||
+				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+				      == 0)))
+					num_tolerated_disk_barrier_failures = 0;
+				else if (num_tolerated_disk_barrier_failures > 1
+					 &&
+					 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+						   BTRFS_BLOCK_GROUP_RAID10)))
+					num_tolerated_disk_barrier_failures = 1;
+			}
+		}
+		up_read(&sinfo->groups_sem);
+	}
+
+	return num_tolerated_disk_barrier_failures;
+}
+
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
@@ -2990,8 +3065,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
 
-	if (do_barriers)
-		barrier_all_devices(root->fs_info);
+	if (do_barriers) {
+		ret = barrier_all_devices(root->fs_info);
+		if (ret) {
+			mutex_unlock(
+				&root->fs_info->fs_devices->device_list_mutex);
+			btrfs_error(root->fs_info, ret,
+				    "errors while submitting device barriers.");
+			return ret;
+		}
+	}
 
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fef..2025a9132c16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				     u64 objectid);
 int btree_lock_page_hook(struct page *page, void *data,
 				void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d6836af6d60f..f5a2e6c4320a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2875,8 +2875,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	return 0;
 }
 
-static void get_block_group_info(struct list_head *groups_list,
-				 struct btrfs_ioctl_space_info *space)
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space)
 {
 	struct btrfs_block_group_cache *block_group;
 
@@ -2984,8 +2984,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 		down_read(&info->groups_sem);
 		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
 			if (!list_empty(&info->block_groups[c])) {
-				get_block_group_info(&info->block_groups[c],
-						     &space);
+				btrfs_get_block_group_info(
+					&info->block_groups[c], &space);
 				memcpy(dest, &space, sizeof(space));
 				dest++;
 				space_args.total_spaces++;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 47911fd18310..67eab2d4d8a9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2425,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * in and cause problems either.
 	 */
 	btrfs_scrub_pause_super(root);
-	write_ctree_super(trans, root->fs_info->tree_root, 1);
+	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
 	btrfs_scrub_continue_super(root);
-	ret = 0;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_wake_log_root;
+	}
 
 	mutex_lock(&root->log_mutex);
 	if (root->last_log_commit < log_transid)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dfe5e3a22f55..029b903a4ae3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		free_fs_devices(cur_devices);
 	}
 
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
 	/*
 	 * at this point, the device is zero sized.  We want to
 	 * remove it from the devices list and zero out the old super
@@ -1799,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	btrfs_clear_space_info_full(root->fs_info);
 
 	unlock_chunks(root);
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
 	ret = btrfs_commit_transaction(trans, root);
 
 	if (seeding_dev) {
@@ -2809,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		}
 	}
 
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		int num_tolerated_disk_barrier_failures;
+		u64 target = bctl->sys.target;
+
+		num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+		if (num_tolerated_disk_barrier_failures > 0 &&
+		    (target &
+		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		      BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+			num_tolerated_disk_barrier_failures = 0;
+		else if (num_tolerated_disk_barrier_failures > 1 &&
+			 (target &
+			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+			num_tolerated_disk_barrier_failures = 1;
+
+		fs_info->num_tolerated_disk_barrier_failures =
+			num_tolerated_disk_barrier_failures;
+	}
+
 	ret = insert_balance_item(fs_info->tree_root, bctl);
 	if (ret && ret != -EEXIST)
 		goto out;
@@ -2841,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		__cancel_balance(fs_info);
 	}
 
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		fs_info->num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	}
+
 	wake_up(&fs_info->balance_wait_q);
 
 	return ret;

From 15e3004a0eb2c4879bc666d0e6a3acd973fb226c Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 5 Oct 2012 13:39:50 -0400
Subject: [PATCH 116/121] Btrfs: cleanup pages properly when ENOMEM in
 compression
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We were freeing non-existent pages which was causing a panic for a user who
was suffering from ENOMEM.  This patch fixes the problem.  Thanks,

Reported-by: Jérôme Poulin <jeromepoulin@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/compression.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43d1c5a3a030..c6467aa88bee 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -577,6 +577,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_start;
 	struct extent_map *em;
 	int ret = -ENOMEM;
+	int faili = 0;
 	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
@@ -626,9 +627,13 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
 		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
 							      __GFP_HIGHMEM);
-		if (!cb->compressed_pages[pg_index])
+		if (!cb->compressed_pages[pg_index]) {
+			faili = pg_index - 1;
+			ret = -ENOMEM;
 			goto fail2;
+		}
 	}
+	faili = nr_pages - 1;
 	cb->nr_pages = nr_pages;
 
 	add_ra_bio_pages(inode, em_start + em_len, cb);
@@ -713,8 +718,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	return 0;
 
 fail2:
-	for (pg_index = 0; pg_index < nr_pages; pg_index++)
-		free_page((unsigned long)cb->compressed_pages[pg_index]);
+	while (faili >= 0) {
+		__free_page(cb->compressed_pages[faili]);
+		faili--;
+	}
 
 	kfree(cb->compressed_pages);
 fail1:

From edd33c99c4ba26ebe17c1a3d65b4aba25482ed32 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 5 Oct 2012 16:40:32 -0400
Subject: [PATCH 117/121] Btrfs: don't bug on enomem in readpage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Get rid of the BUG_ON(ret == -ENOMEM) in __extent_read_full_page.  Thanks,

Reported-by: Jérôme Poulin <jeromepoulin@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent_io.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b82d244a2ef5..8c37cb64be7a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2751,12 +2751,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 					 end_bio_extent_readpage, mirror_num,
 					 *bio_flags,
 					 this_bio_flag);
-			BUG_ON(ret == -ENOMEM);
-			nr++;
-			*bio_flags = this_bio_flag;
+			if (!ret) {
+				nr++;
+				*bio_flags = this_bio_flag;
+			}
 		}
-		if (ret)
+		if (ret) {
 			SetPageError(page);
+			unlock_extent(tree, cur, cur + iosize - 1);
+		}
 		cur = cur + iosize;
 		pg_offset += iosize;
 	}

From 4804b38293c020e7a2c841e86402f456c19d934d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 5 Oct 2012 16:43:45 -0400
Subject: [PATCH 118/121] Btrfs: do not warn_on when we cannot alloc a page for
 an extent buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's just annoying and the user will have gotten a nice OOM killer message
so they are already fully aware they are screwed :).  Thanks,

Reported-by: Jérôme Poulin <jeromepoulin@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent_io.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8c37cb64be7a..7dc69b38548d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4249,10 +4249,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 
 	for (i = 0; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, GFP_NOFS);
-		if (!p) {
-			WARN_ON(1);
+		if (!p)
 			goto free_eb;
-		}
 
 		spin_lock(&mapping->private_lock);
 		if (PagePrivate(p)) {

From f60b1b49f6f72abb8bedfd49b758773bbda043c8 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 5 Oct 2012 16:53:34 -0400
Subject: [PATCH 119/121] Btrfs: fix page leakage

Alloc_dummy_extent_buffer will not free the first page in the eb array if we
fail to allocate a page, fix this.  Thanks,

Reported-by: David Sterba <dave@jikos.cz>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7dc69b38548d..64dc93f64bc0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4104,7 +4104,7 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
 
 	return eb;
 err:
-	for (i--; i > 0; i--)
+	for (i--; i >= 0; i--)
 		__free_page(eb->pages[i]);
 	__free_extent_buffer(eb);
 	return NULL;

From 1037a5affc266fdfe08b2dd415d309ab2778ce6a Mon Sep 17 00:00:00 2001
From: Wang Sheng-Hui <shhuiw@gmail.com>
Date: Mon, 8 Oct 2012 07:26:15 -0600
Subject: [PATCH 120/121] Btrfs: remove repeated eb->pages check in,
 disk-io.c/csum_dirty_buffer

In csum_dirty_buffer, we first get eb from page->private.
Then we check if the page is the first page of eb. Later
we check it again. Remove the repeated check here.

Signed-off-by: Wang Sheng-Hui <shhuiw@gmail.com>
---
 fs/btrfs/disk-io.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 835523687707..7cda51995c1e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -433,10 +433,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 		return 0;
 	}
-	if (eb->pages[0] != page) {
-		WARN_ON(1);
-		return 0;
-	}
 	if (!PageUptodate(page)) {
 		WARN_ON(1);
 		return 0;

From f46dbe3dee853f8a860f889cb2b7ff4c624f2a7a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@fusionio.com>
Date: Tue, 9 Oct 2012 11:17:20 -0400
Subject: [PATCH 121/121] btrfs: init ref_index to zero in add_inode_ref

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 67eab2d4d8a9..e9ebb472b28b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1033,7 +1033,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	int log_ref_ver = 0;
 	u64 parent_objectid;
 	u64 inode_objectid;
-	u64 ref_index;
+	u64 ref_index = 0;
 	int ref_struct_size;
 
 	ref_ptr = btrfs_item_ptr_offset(eb, slot);