From e3e39c72b99f93bbd0420d38c858e7c4a061bb63 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza <mpdesouza@suse.com>
Date: Fri, 21 Aug 2020 11:54:44 -0300
Subject: [PATCH 1/7] btrfs: block-group: fix free-space bitmap threshold

[BUG]
After commit 9afc66498a0b ("btrfs: block-group: refactor how we read one
block group item"), cache->length is being assigned after calling
btrfs_create_block_group_cache. This causes a problem since
set_free_space_tree_thresholds calculates the free-space threshold to
decide if the free-space tree should convert from extents to bitmaps.

The current code calls set_free_space_tree_thresholds with cache->length
being 0, which then makes cache->bitmap_high_thresh zero. This implies
the system will always use bitmap instead of extents, which is not
desired if the block group is not fragmented.

This behavior can be seen by a test that expects to repair systems
with FREE_SPACE_EXTENT and FREE_SPACE_BITMAP, but the current code only
created FREE_SPACE_BITMAP.

[FIX]
Call set_free_space_tree_thresholds after setting cache->length. There
is now a WARN_ON in set_free_space_tree_thresholds to help preventing
the same mistake to happen again in the future.

Link: https://github.com/kdave/btrfs-progs/issues/251
Fixes: 9afc66498a0b ("btrfs: block-group: refactor how we read one block group item")
CC: stable@vger.kernel.org # 5.8+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c     | 4 +++-
 fs/btrfs/free-space-tree.c | 4 ++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 613920c17ac1..ea8aaf36647e 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1798,7 +1798,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
 
 	cache->fs_info = fs_info;
 	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
-	set_free_space_tree_thresholds(cache);
 
 	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
 
@@ -1912,6 +1911,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
 	if (ret < 0)
 		goto error;
 
+	set_free_space_tree_thresholds(cache);
+
 	if (need_clear) {
 		/*
 		 * When we mount with old space cache, we need to
@@ -2132,6 +2133,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
 		return -ENOMEM;
 
 	cache->length = size;
+	set_free_space_tree_thresholds(cache);
 	cache->used = bytes_used;
 	cache->flags = type;
 	cache->last_byte_to_unpin = (u64)-1;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 8b1f5c8897b7..6b9faf3b0e96 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -22,6 +22,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
 	size_t bitmap_size;
 	u64 num_bitmaps, total_bitmap_size;
 
+	if (WARN_ON(cache->length == 0))
+		btrfs_warn(cache->fs_info, "block group %llu length is zero",
+			   cache->start);
+
 	/*
 	 * We convert to bitmaps when the disk space required for using extents
 	 * exceeds that required for using bitmaps.

From 9771a5cf937129307d9f58922d60484d58ababe7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 10 Aug 2020 11:42:26 -0400
Subject: [PATCH 2/7] btrfs: drop path before adding new uuid tree entry

With the conversion of the tree locks to rwsem I got the following
lockdep splat:

  ======================================================
  WARNING: possible circular locking dependency detected
  5.8.0-rc7-00167-g0d7ba0c5b375-dirty #925 Not tainted
  ------------------------------------------------------
  btrfs-uuid/7955 is trying to acquire lock:
  ffff88bfbafec0f8 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180

  but task is already holding lock:
  ffff88bfbafef2a8 (btrfs-uuid-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #1 (btrfs-uuid-00){++++}-{3:3}:
	 down_read_nested+0x3e/0x140
	 __btrfs_tree_read_lock+0x39/0x180
	 __btrfs_read_lock_root_node+0x3a/0x50
	 btrfs_search_slot+0x4bd/0x990
	 btrfs_uuid_tree_add+0x89/0x2d0
	 btrfs_uuid_scan_kthread+0x330/0x390
	 kthread+0x133/0x150
	 ret_from_fork+0x1f/0x30

  -> #0 (btrfs-root-00){++++}-{3:3}:
	 __lock_acquire+0x1272/0x2310
	 lock_acquire+0x9e/0x360
	 down_read_nested+0x3e/0x140
	 __btrfs_tree_read_lock+0x39/0x180
	 __btrfs_read_lock_root_node+0x3a/0x50
	 btrfs_search_slot+0x4bd/0x990
	 btrfs_find_root+0x45/0x1b0
	 btrfs_read_tree_root+0x61/0x100
	 btrfs_get_root_ref.part.50+0x143/0x630
	 btrfs_uuid_tree_iterate+0x207/0x314
	 btrfs_uuid_rescan_kthread+0x12/0x50
	 kthread+0x133/0x150
	 ret_from_fork+0x1f/0x30

  other info that might help us debug this:

   Possible unsafe locking scenario:

	 CPU0                    CPU1
	 ----                    ----
    lock(btrfs-uuid-00);
				 lock(btrfs-root-00);
				 lock(btrfs-uuid-00);
    lock(btrfs-root-00);

   *** DEADLOCK ***

  1 lock held by btrfs-uuid/7955:
   #0: ffff88bfbafef2a8 (btrfs-uuid-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180

  stack backtrace:
  CPU: 73 PID: 7955 Comm: btrfs-uuid Kdump: loaded Not tainted 5.8.0-rc7-00167-g0d7ba0c5b375-dirty #925
  Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018
  Call Trace:
   dump_stack+0x78/0xa0
   check_noncircular+0x165/0x180
   __lock_acquire+0x1272/0x2310
   lock_acquire+0x9e/0x360
   ? __btrfs_tree_read_lock+0x39/0x180
   ? btrfs_root_node+0x1c/0x1d0
   down_read_nested+0x3e/0x140
   ? __btrfs_tree_read_lock+0x39/0x180
   __btrfs_tree_read_lock+0x39/0x180
   __btrfs_read_lock_root_node+0x3a/0x50
   btrfs_search_slot+0x4bd/0x990
   btrfs_find_root+0x45/0x1b0
   btrfs_read_tree_root+0x61/0x100
   btrfs_get_root_ref.part.50+0x143/0x630
   btrfs_uuid_tree_iterate+0x207/0x314
   ? btree_readpage+0x20/0x20
   btrfs_uuid_rescan_kthread+0x12/0x50
   kthread+0x133/0x150
   ? kthread_create_on_node+0x60/0x60
   ret_from_fork+0x1f/0x30

This problem exists because we have two different rescan threads,
btrfs_uuid_scan_kthread which creates the uuid tree, and
btrfs_uuid_tree_iterate that goes through and updates or deletes any out
of date roots.  The problem is they both do things in different order.
btrfs_uuid_scan_kthread() reads the tree_root, and then inserts entries
into the uuid_root.  btrfs_uuid_tree_iterate() scans the uuid_root, but
then does a btrfs_get_fs_root() which can read from the tree_root.

It's actually easy enough to not be holding the path in
btrfs_uuid_scan_kthread() when we add a uuid entry, as we already drop
it further down and re-start the search when we loop.  So simply move
the path release before we add our entry to the uuid tree.

This also fixes a problem where we're holding a path open after we do
btrfs_end_transaction(), which has it's own problems.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ee96c5869f57..214856c4ccb1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4462,6 +4462,7 @@ int btrfs_uuid_scan_kthread(void *data)
 			goto skip;
 		}
 update_tree:
+		btrfs_release_path(path);
 		if (!btrfs_is_empty_uuid(root_item.uuid)) {
 			ret = btrfs_uuid_tree_add(trans, root_item.uuid,
 						  BTRFS_UUID_KEY_SUBVOL,
@@ -4486,6 +4487,7 @@ update_tree:
 		}
 
 skip:
+		btrfs_release_path(path);
 		if (trans) {
 			ret = btrfs_end_transaction(trans);
 			trans = NULL;
@@ -4493,7 +4495,6 @@ skip:
 				break;
 		}
 
-		btrfs_release_path(path);
 		if (key.offset < (u64)-1) {
 			key.offset++;
 		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {

From a48b73eca4ceb9b8a4b97f290a065335dbcd8a04 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 10 Aug 2020 11:42:27 -0400
Subject: [PATCH 3/7] btrfs: fix potential deadlock in the search ioctl

With the conversion of the tree locks to rwsem I got the following
lockdep splat:

  ======================================================
  WARNING: possible circular locking dependency detected
  5.8.0-rc7-00165-g04ec4da5f45f-dirty #922 Not tainted
  ------------------------------------------------------
  compsize/11122 is trying to acquire lock:
  ffff889fabca8768 (&mm->mmap_lock#2){++++}-{3:3}, at: __might_fault+0x3e/0x90

  but task is already holding lock:
  ffff889fe720fe40 (btrfs-fs-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #2 (btrfs-fs-00){++++}-{3:3}:
	 down_write_nested+0x3b/0x70
	 __btrfs_tree_lock+0x24/0x120
	 btrfs_search_slot+0x756/0x990
	 btrfs_lookup_inode+0x3a/0xb4
	 __btrfs_update_delayed_inode+0x93/0x270
	 btrfs_async_run_delayed_root+0x168/0x230
	 btrfs_work_helper+0xd4/0x570
	 process_one_work+0x2ad/0x5f0
	 worker_thread+0x3a/0x3d0
	 kthread+0x133/0x150
	 ret_from_fork+0x1f/0x30

  -> #1 (&delayed_node->mutex){+.+.}-{3:3}:
	 __mutex_lock+0x9f/0x930
	 btrfs_delayed_update_inode+0x50/0x440
	 btrfs_update_inode+0x8a/0xf0
	 btrfs_dirty_inode+0x5b/0xd0
	 touch_atime+0xa1/0xd0
	 btrfs_file_mmap+0x3f/0x60
	 mmap_region+0x3a4/0x640
	 do_mmap+0x376/0x580
	 vm_mmap_pgoff+0xd5/0x120
	 ksys_mmap_pgoff+0x193/0x230
	 do_syscall_64+0x50/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #0 (&mm->mmap_lock#2){++++}-{3:3}:
	 __lock_acquire+0x1272/0x2310
	 lock_acquire+0x9e/0x360
	 __might_fault+0x68/0x90
	 _copy_to_user+0x1e/0x80
	 copy_to_sk.isra.32+0x121/0x300
	 search_ioctl+0x106/0x200
	 btrfs_ioctl_tree_search_v2+0x7b/0xf0
	 btrfs_ioctl+0x106f/0x30a0
	 ksys_ioctl+0x83/0xc0
	 __x64_sys_ioctl+0x16/0x20
	 do_syscall_64+0x50/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  other info that might help us debug this:

  Chain exists of:
    &mm->mmap_lock#2 --> &delayed_node->mutex --> btrfs-fs-00

   Possible unsafe locking scenario:

	 CPU0                    CPU1
	 ----                    ----
    lock(btrfs-fs-00);
				 lock(&delayed_node->mutex);
				 lock(btrfs-fs-00);
    lock(&mm->mmap_lock#2);

   *** DEADLOCK ***

  1 lock held by compsize/11122:
   #0: ffff889fe720fe40 (btrfs-fs-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180

  stack backtrace:
  CPU: 17 PID: 11122 Comm: compsize Kdump: loaded Not tainted 5.8.0-rc7-00165-g04ec4da5f45f-dirty #922
  Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018
  Call Trace:
   dump_stack+0x78/0xa0
   check_noncircular+0x165/0x180
   __lock_acquire+0x1272/0x2310
   lock_acquire+0x9e/0x360
   ? __might_fault+0x3e/0x90
   ? find_held_lock+0x72/0x90
   __might_fault+0x68/0x90
   ? __might_fault+0x3e/0x90
   _copy_to_user+0x1e/0x80
   copy_to_sk.isra.32+0x121/0x300
   ? btrfs_search_forward+0x2a6/0x360
   search_ioctl+0x106/0x200
   btrfs_ioctl_tree_search_v2+0x7b/0xf0
   btrfs_ioctl+0x106f/0x30a0
   ? __do_sys_newfstat+0x5a/0x70
   ? ksys_ioctl+0x83/0xc0
   ksys_ioctl+0x83/0xc0
   __x64_sys_ioctl+0x16/0x20
   do_syscall_64+0x50/0x90
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

The problem is we're doing a copy_to_user() while holding tree locks,
which can deadlock if we have to do a page fault for the copy_to_user().
This exists even without my locking changes, so it needs to be fixed.
Rework the search ioctl to do the pre-fault and then
copy_to_user_nofault for the copying.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c |  8 ++++----
 fs/btrfs/extent_io.h |  6 +++---
 fs/btrfs/ioctl.c     | 27 ++++++++++++++++++++-------
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 617ea38e6fd7..c15ab6c1897f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5653,9 +5653,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
 	}
 }
 
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
-			       void __user *dstv,
-			       unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+				       void __user *dstv,
+				       unsigned long start, unsigned long len)
 {
 	size_t cur;
 	size_t offset;
@@ -5675,7 +5675,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
 
 		cur = min(len, (PAGE_SIZE - offset));
 		kaddr = page_address(page);
-		if (copy_to_user(dst, kaddr + offset, cur)) {
+		if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
 			ret = -EFAULT;
 			break;
 		}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 00a88f2eb5ab..30794ae58498 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -241,9 +241,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
 void read_extent_buffer(const struct extent_buffer *eb, void *dst,
 			unsigned long start,
 			unsigned long len);
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
-			       void __user *dst, unsigned long start,
-			       unsigned long len);
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+				       void __user *dst, unsigned long start,
+				       unsigned long len);
 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
 		const void *src);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd3511c5ca81..ac45f022b495 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2086,9 +2086,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
 		sh.len = item_len;
 		sh.transid = found_transid;
 
-		/* copy search result header */
-		if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
-			ret = -EFAULT;
+		/*
+		 * Copy search result header. If we fault then loop again so we
+		 * can fault in the pages and -EFAULT there if there's a
+		 * problem. Otherwise we'll fault and then copy the buffer in
+		 * properly this next time through
+		 */
+		if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
+			ret = 0;
 			goto out;
 		}
 
@@ -2096,10 +2101,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
 
 		if (item_len) {
 			char __user *up = ubuf + *sk_offset;
-			/* copy the item */
-			if (read_extent_buffer_to_user(leaf, up,
-						       item_off, item_len)) {
-				ret = -EFAULT;
+			/*
+			 * Copy the item, same behavior as above, but reset the
+			 * * sk_offset so we copy the full thing again.
+			 */
+			if (read_extent_buffer_to_user_nofault(leaf, up,
+						item_off, item_len)) {
+				ret = 0;
+				*sk_offset -= sizeof(sh);
 				goto out;
 			}
 
@@ -2184,6 +2193,10 @@ static noinline int search_ioctl(struct inode *inode,
 	key.offset = sk->min_offset;
 
 	while (1) {
+		ret = fault_in_pages_writeable(ubuf, *buf_size - sk_offset);
+		if (ret)
+			break;
+
 		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
 		if (ret != 0) {
 			if (ret > 0)

From e89c4a9c8e6ce3a84cab4f342687d3fbbb1234eb Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 10 Aug 2020 11:42:29 -0400
Subject: [PATCH 4/7] btrfs: allocate scrub workqueues outside of locks

I got the following lockdep splat while testing:

  ======================================================
  WARNING: possible circular locking dependency detected
  5.8.0-rc7-00172-g021118712e59 #932 Not tainted
  ------------------------------------------------------
  btrfs/229626 is trying to acquire lock:
  ffffffff828513f0 (cpu_hotplug_lock){++++}-{0:0}, at: alloc_workqueue+0x378/0x450

  but task is already holding lock:
  ffff889dd3889518 (&fs_info->scrub_lock){+.+.}-{3:3}, at: btrfs_scrub_dev+0x11c/0x630

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #7 (&fs_info->scrub_lock){+.+.}-{3:3}:
	 __mutex_lock+0x9f/0x930
	 btrfs_scrub_dev+0x11c/0x630
	 btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4
	 btrfs_ioctl+0x2799/0x30a0
	 ksys_ioctl+0x83/0xc0
	 __x64_sys_ioctl+0x16/0x20
	 do_syscall_64+0x50/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #6 (&fs_devs->device_list_mutex){+.+.}-{3:3}:
	 __mutex_lock+0x9f/0x930
	 btrfs_run_dev_stats+0x49/0x480
	 commit_cowonly_roots+0xb5/0x2a0
	 btrfs_commit_transaction+0x516/0xa60
	 sync_filesystem+0x6b/0x90
	 generic_shutdown_super+0x22/0x100
	 kill_anon_super+0xe/0x30
	 btrfs_kill_super+0x12/0x20
	 deactivate_locked_super+0x29/0x60
	 cleanup_mnt+0xb8/0x140
	 task_work_run+0x6d/0xb0
	 __prepare_exit_to_usermode+0x1cc/0x1e0
	 do_syscall_64+0x5c/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #5 (&fs_info->tree_log_mutex){+.+.}-{3:3}:
	 __mutex_lock+0x9f/0x930
	 btrfs_commit_transaction+0x4bb/0xa60
	 sync_filesystem+0x6b/0x90
	 generic_shutdown_super+0x22/0x100
	 kill_anon_super+0xe/0x30
	 btrfs_kill_super+0x12/0x20
	 deactivate_locked_super+0x29/0x60
	 cleanup_mnt+0xb8/0x140
	 task_work_run+0x6d/0xb0
	 __prepare_exit_to_usermode+0x1cc/0x1e0
	 do_syscall_64+0x5c/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #4 (&fs_info->reloc_mutex){+.+.}-{3:3}:
	 __mutex_lock+0x9f/0x930
	 btrfs_record_root_in_trans+0x43/0x70
	 start_transaction+0xd1/0x5d0
	 btrfs_dirty_inode+0x42/0xd0
	 touch_atime+0xa1/0xd0
	 btrfs_file_mmap+0x3f/0x60
	 mmap_region+0x3a4/0x640
	 do_mmap+0x376/0x580
	 vm_mmap_pgoff+0xd5/0x120
	 ksys_mmap_pgoff+0x193/0x230
	 do_syscall_64+0x50/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #3 (&mm->mmap_lock#2){++++}-{3:3}:
	 __might_fault+0x68/0x90
	 _copy_to_user+0x1e/0x80
	 perf_read+0x141/0x2c0
	 vfs_read+0xad/0x1b0
	 ksys_read+0x5f/0xe0
	 do_syscall_64+0x50/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #2 (&cpuctx_mutex){+.+.}-{3:3}:
	 __mutex_lock+0x9f/0x930
	 perf_event_init_cpu+0x88/0x150
	 perf_event_init+0x1db/0x20b
	 start_kernel+0x3ae/0x53c
	 secondary_startup_64+0xa4/0xb0

  -> #1 (pmus_lock){+.+.}-{3:3}:
	 __mutex_lock+0x9f/0x930
	 perf_event_init_cpu+0x4f/0x150
	 cpuhp_invoke_callback+0xb1/0x900
	 _cpu_up.constprop.26+0x9f/0x130
	 cpu_up+0x7b/0xc0
	 bringup_nonboot_cpus+0x4f/0x60
	 smp_init+0x26/0x71
	 kernel_init_freeable+0x110/0x258
	 kernel_init+0xa/0x103
	 ret_from_fork+0x1f/0x30

  -> #0 (cpu_hotplug_lock){++++}-{0:0}:
	 __lock_acquire+0x1272/0x2310
	 lock_acquire+0x9e/0x360
	 cpus_read_lock+0x39/0xb0
	 alloc_workqueue+0x378/0x450
	 __btrfs_alloc_workqueue+0x15d/0x200
	 btrfs_alloc_workqueue+0x51/0x160
	 scrub_workers_get+0x5a/0x170
	 btrfs_scrub_dev+0x18c/0x630
	 btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4
	 btrfs_ioctl+0x2799/0x30a0
	 ksys_ioctl+0x83/0xc0
	 __x64_sys_ioctl+0x16/0x20
	 do_syscall_64+0x50/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  other info that might help us debug this:

  Chain exists of:
    cpu_hotplug_lock --> &fs_devs->device_list_mutex --> &fs_info->scrub_lock

   Possible unsafe locking scenario:

	 CPU0                    CPU1
	 ----                    ----
    lock(&fs_info->scrub_lock);
				 lock(&fs_devs->device_list_mutex);
				 lock(&fs_info->scrub_lock);
    lock(cpu_hotplug_lock);

   *** DEADLOCK ***

  2 locks held by btrfs/229626:
   #0: ffff88bfe8bb86e0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: btrfs_scrub_dev+0xbd/0x630
   #1: ffff889dd3889518 (&fs_info->scrub_lock){+.+.}-{3:3}, at: btrfs_scrub_dev+0x11c/0x630

  stack backtrace:
  CPU: 15 PID: 229626 Comm: btrfs Kdump: loaded Not tainted 5.8.0-rc7-00172-g021118712e59 #932
  Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018
  Call Trace:
   dump_stack+0x78/0xa0
   check_noncircular+0x165/0x180
   __lock_acquire+0x1272/0x2310
   lock_acquire+0x9e/0x360
   ? alloc_workqueue+0x378/0x450
   cpus_read_lock+0x39/0xb0
   ? alloc_workqueue+0x378/0x450
   alloc_workqueue+0x378/0x450
   ? rcu_read_lock_sched_held+0x52/0x80
   __btrfs_alloc_workqueue+0x15d/0x200
   btrfs_alloc_workqueue+0x51/0x160
   scrub_workers_get+0x5a/0x170
   btrfs_scrub_dev+0x18c/0x630
   ? start_transaction+0xd1/0x5d0
   btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4
   btrfs_ioctl+0x2799/0x30a0
   ? do_sigaction+0x102/0x250
   ? lockdep_hardirqs_on_prepare+0xca/0x160
   ? _raw_spin_unlock_irq+0x24/0x30
   ? trace_hardirqs_on+0x1c/0xe0
   ? _raw_spin_unlock_irq+0x24/0x30
   ? do_sigaction+0x102/0x250
   ? ksys_ioctl+0x83/0xc0
   ksys_ioctl+0x83/0xc0
   __x64_sys_ioctl+0x16/0x20
   do_syscall_64+0x50/0x90
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

This happens because we're allocating the scrub workqueues under the
scrub and device list mutex, which brings in a whole host of other
dependencies.

Because the work queue allocation is done with GFP_KERNEL, it can
trigger reclaim, which can lead to a transaction commit, which in turns
needs the device_list_mutex, it can lead to a deadlock. A different
problem for which this fix is a solution.

Fix this by moving the actual allocation outside of the
scrub lock, and then only take the lock once we're ready to actually
assign them to the fs_info.  We'll now have to cleanup the workqueues in
a few more places, so I've added a helper to do the refcount dance to
safely free the workqueues.

CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 126 +++++++++++++++++++++++++++--------------------
 1 file changed, 72 insertions(+), 54 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5a6cb9db512e..354ab9985a34 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3716,50 +3716,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 	return 0;
 }
 
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+					&fs_info->scrub_lock)) {
+		struct btrfs_workqueue *scrub_workers = NULL;
+		struct btrfs_workqueue *scrub_wr_comp = NULL;
+		struct btrfs_workqueue *scrub_parity = NULL;
+
+		scrub_workers = fs_info->scrub_workers;
+		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+		scrub_parity = fs_info->scrub_parity_workers;
+
+		fs_info->scrub_workers = NULL;
+		fs_info->scrub_wr_completion_workers = NULL;
+		fs_info->scrub_parity_workers = NULL;
+		mutex_unlock(&fs_info->scrub_lock);
+
+		btrfs_destroy_workqueue(scrub_workers);
+		btrfs_destroy_workqueue(scrub_wr_comp);
+		btrfs_destroy_workqueue(scrub_parity);
+	}
+}
+
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
+	struct btrfs_workqueue *scrub_workers = NULL;
+	struct btrfs_workqueue *scrub_wr_comp = NULL;
+	struct btrfs_workqueue *scrub_parity = NULL;
 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
+	int ret = -ENOMEM;
 
-	lockdep_assert_held(&fs_info->scrub_lock);
+	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+		return 0;
 
+	scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
+					      is_dev_replace ? 1 : max_active, 4);
+	if (!scrub_workers)
+		goto fail_scrub_workers;
+
+	scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
+					      max_active, 2);
+	if (!scrub_wr_comp)
+		goto fail_scrub_wr_completion_workers;
+
+	scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+					     max_active, 2);
+	if (!scrub_parity)
+		goto fail_scrub_parity_workers;
+
+	mutex_lock(&fs_info->scrub_lock);
 	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
-		ASSERT(fs_info->scrub_workers == NULL);
-		fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
-				flags, is_dev_replace ? 1 : max_active, 4);
-		if (!fs_info->scrub_workers)
-			goto fail_scrub_workers;
-
-		ASSERT(fs_info->scrub_wr_completion_workers == NULL);
-		fs_info->scrub_wr_completion_workers =
-			btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
-					      max_active, 2);
-		if (!fs_info->scrub_wr_completion_workers)
-			goto fail_scrub_wr_completion_workers;
-
-		ASSERT(fs_info->scrub_parity_workers == NULL);
-		fs_info->scrub_parity_workers =
-			btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
-					      max_active, 2);
-		if (!fs_info->scrub_parity_workers)
-			goto fail_scrub_parity_workers;
-
+		ASSERT(fs_info->scrub_workers == NULL &&
+		       fs_info->scrub_wr_completion_workers == NULL &&
+		       fs_info->scrub_parity_workers == NULL);
+		fs_info->scrub_workers = scrub_workers;
+		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+		fs_info->scrub_parity_workers = scrub_parity;
 		refcount_set(&fs_info->scrub_workers_refcnt, 1);
-	} else {
-		refcount_inc(&fs_info->scrub_workers_refcnt);
+		mutex_unlock(&fs_info->scrub_lock);
+		return 0;
 	}
-	return 0;
+	/* Other thread raced in and created the workers for us */
+	refcount_inc(&fs_info->scrub_workers_refcnt);
+	mutex_unlock(&fs_info->scrub_lock);
 
+	ret = 0;
+	btrfs_destroy_workqueue(scrub_parity);
 fail_scrub_parity_workers:
-	btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+	btrfs_destroy_workqueue(scrub_wr_comp);
 fail_scrub_wr_completion_workers:
-	btrfs_destroy_workqueue(fs_info->scrub_workers);
+	btrfs_destroy_workqueue(scrub_workers);
 fail_scrub_workers:
-	return -ENOMEM;
+	return ret;
 }
 
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3770,9 +3804,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	int ret;
 	struct btrfs_device *dev;
 	unsigned int nofs_flag;
-	struct btrfs_workqueue *scrub_workers = NULL;
-	struct btrfs_workqueue *scrub_wr_comp = NULL;
-	struct btrfs_workqueue *scrub_parity = NULL;
 
 	if (btrfs_fs_closing(fs_info))
 		return -EAGAIN;
@@ -3819,13 +3850,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	if (IS_ERR(sctx))
 		return PTR_ERR(sctx);
 
+	ret = scrub_workers_get(fs_info, is_dev_replace);
+	if (ret)
+		goto out_free_ctx;
+
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
 	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
 		     !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -ENODEV;
-		goto out_free_ctx;
+		goto out;
 	}
 
 	if (!is_dev_replace && !readonly &&
@@ -3834,7 +3869,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
 				rcu_str_deref(dev->name));
 		ret = -EROFS;
-		goto out_free_ctx;
+		goto out;
 	}
 
 	mutex_lock(&fs_info->scrub_lock);
@@ -3843,7 +3878,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -EIO;
-		goto out_free_ctx;
+		goto out;
 	}
 
 	down_read(&fs_info->dev_replace.rwsem);
@@ -3854,17 +3889,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		ret = -EINPROGRESS;
-		goto out_free_ctx;
+		goto out;
 	}
 	up_read(&fs_info->dev_replace.rwsem);
 
-	ret = scrub_workers_get(fs_info, is_dev_replace);
-	if (ret) {
-		mutex_unlock(&fs_info->scrub_lock);
-		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		goto out_free_ctx;
-	}
-
 	sctx->readonly = readonly;
 	dev->scrub_ctx = sctx;
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3917,24 +3945,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_ctx = NULL;
-	if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
-		scrub_workers = fs_info->scrub_workers;
-		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
-		scrub_parity = fs_info->scrub_parity_workers;
-
-		fs_info->scrub_workers = NULL;
-		fs_info->scrub_wr_completion_workers = NULL;
-		fs_info->scrub_parity_workers = NULL;
-	}
 	mutex_unlock(&fs_info->scrub_lock);
 
-	btrfs_destroy_workqueue(scrub_workers);
-	btrfs_destroy_workqueue(scrub_wr_comp);
-	btrfs_destroy_workqueue(scrub_parity);
+	scrub_workers_put(fs_info);
 	scrub_put_ctx(sctx);
 
 	return ret;
-
+out:
+	scrub_workers_put(fs_info);
 out_free_ctx:
 	scrub_free_ctx(sctx);
 

From ad24466588ab7d7c879053c5afd919b0c555fec0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 10 Aug 2020 11:42:30 -0400
Subject: [PATCH 5/7] btrfs: set the correct lockdep class for new nodes

When flipping over to the rw_semaphore I noticed I'd get a lockdep splat
in replace_path(), which is weird because we're swapping the reloc root
with the actual target root.  Turns out this is because we're using the
root->root_key.objectid as the root id for the newly allocated tree
block when setting the lockdep class, however we need to be using the
actual owner of this new block, which is saved in owner.

The affected path is through btrfs_copy_root as all other callers of
btrfs_alloc_tree_block (which calls init_new_buffer) have root_objectid
== root->root_key.objectid .

CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5871ef78edba..e9eedc053fc5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4527,7 +4527,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		return ERR_PTR(-EUCLEAN);
 	}
 
-	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
+	btrfs_set_buffer_lockdep_class(owner, buf, level);
 	btrfs_tree_lock(buf);
 	btrfs_clean_tree_block(buf);
 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);

From d3beaa253fd6fa40b8b18a216398e6e5376a9d21 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 10 Aug 2020 11:42:31 -0400
Subject: [PATCH 6/7] btrfs: set the lockdep class for log tree extent buffers

These are special extent buffers that get rewound in order to lookup
the state of the tree at a specific point in time.  As such they do not
go through the normal initialization paths that set their lockdep class,
so handle them appropriately when they are created and before they are
locked.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cd1cd673bc0b..cd392da69b81 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1297,6 +1297,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 	btrfs_tree_read_unlock_blocking(eb);
 	free_extent_buffer(eb);
 
+	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+				       eb_rewin, btrfs_header_level(eb_rewin));
 	btrfs_tree_read_lock(eb_rewin);
 	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1370,7 +1372,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 
 	if (!eb)
 		return NULL;
-	btrfs_tree_read_lock(eb);
 	if (old_root) {
 		btrfs_set_header_bytenr(eb, eb->start);
 		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
@@ -1378,6 +1379,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		btrfs_set_header_level(eb, old_root->level);
 		btrfs_set_header_generation(eb, old_generation);
 	}
+	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+				       btrfs_header_level(eb));
+	btrfs_tree_read_lock(eb);
 	if (tm)
 		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
 	else

From f96d6960abbc52e26ad124e69e6815283d3e1674 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 25 Aug 2020 21:42:51 +0800
Subject: [PATCH 7/7] btrfs: tree-checker: fix the error message for transid
 error

The error message for inode transid is the same as for inode generation,
which makes us unable to detect the real problem.

Reported-by: Tyler Richmond <t.d.richmond@gmail.com>
Fixes: 496245cac57e ("btrfs: tree-checker: Verify inode item")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Marcos Paulo de Souza <mpdesouza@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-checker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 517b44300a05..7b1fee630f97 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -984,7 +984,7 @@ static int check_inode_item(struct extent_buffer *leaf,
 	/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
 	if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
 		inode_item_err(leaf, slot,
-			"invalid inode generation: has %llu expect [0, %llu]",
+			"invalid inode transid: has %llu expect [0, %llu]",
 			       btrfs_inode_transid(leaf, iitem), super_gen + 1);
 		return -EUCLEAN;
 	}