btrfs: preallocate anon block device at first phase of snapshot creation

[BUG]
When the anonymous block device pool is exhausted, subvolume/snapshot
creation fails with EMFILE (Too many open files). This has been reported
by a user. The allocation happens in the second phase, during transaction
commit, where the only way out is to abort the transaction:

  BTRFS: Transaction aborted (error -24)
  WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs]
  RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs]
  Call Trace:
   create_pending_snapshots+0x82/0xa0 [btrfs]
   btrfs_commit_transaction+0x275/0x8c0 [btrfs]
   btrfs_mksubvol+0x4b9/0x500 [btrfs]
   btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
   btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
   btrfs_ioctl+0x11a4/0x2da0 [btrfs]
   do_vfs_ioctl+0xa9/0x640
   ksys_ioctl+0x67/0x90
   __x64_sys_ioctl+0x1a/0x20
   do_syscall_64+0x5a/0x110
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  ---[ end trace 33f2f83f3d5250e9 ]---
  BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown
  BTRFS info (device sda1): forced readonly
  BTRFS warning (device sda1): Skipping commit of aborted transaction.
  BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown

[CAUSE]
When the global anonymous block device pool is exhausted, the following
call chain will fail, and lead to transaction abort:

 btrfs_ioctl_snap_create_v2()
 |- btrfs_ioctl_snap_create_transid()
    |- btrfs_mksubvol()
       |- btrfs_commit_transaction()
          |- create_pending_snapshot()
             |- btrfs_get_fs_root()
                |- btrfs_init_fs_root()
                   |- get_anon_bdev()

[FIX]
Although we can't enlarge the anonymous block device pool, at least we
can preallocate anon_dev for subvolume/snapshot in the first phase,
outside of transaction context and exactly at the moment the user calls
the creation ioctl. If the pool is exhausted, the ioctl then fails with
a plain error instead of aborting the transaction.

Reported-by: Greed Rong <greedrong@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Qu Wenruo 2020-06-16 10:17:36 +08:00 committed by David Sterba
parent 082b6c970f
commit 2dfb1e43f5
5 changed files with 89 additions and 9 deletions

View File

@ -1391,7 +1391,12 @@ alloc_fail:
goto out; goto out;
} }
static int btrfs_init_fs_root(struct btrfs_root *root) /*
* Initialize subvolume root in-memory structure
*
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
*/
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{ {
int ret; int ret;
unsigned int nofs_flag; unsigned int nofs_flag;
@ -1430,9 +1435,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root)
*/ */
if (is_fstree(root->root_key.objectid) && if (is_fstree(root->root_key.objectid) &&
btrfs_root_refs(&root->root_item) > 0) { btrfs_root_refs(&root->root_item) > 0) {
ret = get_anon_bdev(&root->anon_dev); if (!anon_dev) {
if (ret) ret = get_anon_bdev(&root->anon_dev);
goto fail; if (ret)
goto fail;
} else {
root->anon_dev = anon_dev;
}
} }
mutex_lock(&root->objectid_mutex); mutex_lock(&root->objectid_mutex);
@ -1537,8 +1546,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
} }
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, /*
u64 objectid, bool check_ref) * Get an in-memory reference of a root structure.
*
* For essential trees like root/extent tree, we grab it from fs_info directly.
* For subvolume trees, we check the cached filesystem roots first. If not
* found, then read it from disk and add it to cached fs roots.
*
* Caller should release the root by calling btrfs_put_root() after the usage.
*
* NOTE: Reloc and log trees can't be read by this function as they share the
* same root objectid.
*
* @objectid: root id
* @anon_dev: preallocated anonymous block device number for new roots,
* pass 0 for new allocation.
* @check_ref: whether to check root item references; if true, return -ENOENT
* for orphan roots
*/
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev,
bool check_ref)
{ {
struct btrfs_root *root; struct btrfs_root *root;
struct btrfs_path *path; struct btrfs_path *path;
@ -1567,6 +1595,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
again: again:
root = btrfs_lookup_fs_root(fs_info, objectid); root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) { if (root) {
/* Shouldn't get preallocated anon_dev for cached roots */
ASSERT(!anon_dev);
if (check_ref && btrfs_root_refs(&root->root_item) == 0) { if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root); btrfs_put_root(root);
return ERR_PTR(-ENOENT); return ERR_PTR(-ENOENT);
@ -1586,7 +1616,7 @@ again:
goto fail; goto fail;
} }
ret = btrfs_init_fs_root(root); ret = btrfs_init_fs_root(root, anon_dev);
if (ret) if (ret)
goto fail; goto fail;
@ -1619,6 +1649,33 @@ fail:
return ERR_PTR(ret); return ERR_PTR(ret);
} }
/*
* Get in-memory reference of a root structure
*
* Thin wrapper around btrfs_get_root_ref() that passes 0 as the anonymous
* block device, i.e. no device number has been preallocated by the caller.
*
* @fs_info: filesystem info passed through to btrfs_get_root_ref()
* @objectid: tree objectid
* @check_ref: if set, verify that the tree exists and the item has at least
* one reference
*
* Returns whatever btrfs_get_root_ref() returns: the root, or an ERR_PTR()
* value on failure.
*/
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
{
/* anon_dev == 0: nothing preallocated, allocate on demand if needed */
return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
}
/*
* Get in-memory reference of a root structure, created as new, optionally pass
* the anonymous block device id
*
* Thin wrapper around btrfs_get_root_ref() that always requests the root
* item reference check (check_ref == true).
*
* @fs_info: filesystem info passed through to btrfs_get_root_ref()
* @objectid: tree objectid
* @anon_dev: preallocated anonymous block device number to attach to the
* root; if zero, a new anonymous block device is allocated instead
*
* Returns whatever btrfs_get_root_ref() returns: the root, or an ERR_PTR()
* value on failure.
*
* NOTE: a non-zero @anon_dev must only be passed for a root that is not
* cached yet; btrfs_get_root_ref() asserts this. On success the device number
* belongs to the returned root. On failure it is not released here, so the
* caller still has to free it (as create_subvol() does).
*/
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev)
{
/* New roots are expected to be referenced, hence check_ref == true */
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
static int btrfs_congested_fn(void *congested_data, int bdi_bits) static int btrfs_congested_fn(void *congested_data, int bdi_bits)
{ {
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;

View File

@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref); u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, dev_t anon_dev);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);

View File

@ -566,6 +566,7 @@ static noinline int create_subvol(struct inode *dir,
struct inode *inode; struct inode *inode;
int ret; int ret;
int err; int err;
dev_t anon_dev = 0;
u64 objectid; u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0; u64 index = 0;
@ -578,6 +579,10 @@ static noinline int create_subvol(struct inode *dir,
if (ret) if (ret)
goto fail_free; goto fail_free;
ret = get_anon_bdev(&anon_dev);
if (ret < 0)
goto fail_free;
/* /*
* Don't create subvolume whose level is not zero. Or qgroup will be * Don't create subvolume whose level is not zero. Or qgroup will be
* screwed up since it assumes subvolume qgroup's level to be 0. * screwed up since it assumes subvolume qgroup's level to be 0.
@ -660,12 +665,15 @@ static noinline int create_subvol(struct inode *dir,
goto fail; goto fail;
key.offset = (u64)-1; key.offset = (u64)-1;
new_root = btrfs_get_fs_root(fs_info, objectid, true); new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) { if (IS_ERR(new_root)) {
free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root); ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret); btrfs_abort_transaction(trans, ret);
goto fail; goto fail;
} }
/* Freeing will be done in btrfs_put_root() of new_root */
anon_dev = 0;
btrfs_record_root_in_trans(trans, new_root); btrfs_record_root_in_trans(trans, new_root);
@ -735,6 +743,8 @@ fail:
return ret; return ret;
fail_free: fail_free:
if (anon_dev)
free_anon_bdev(anon_dev);
kfree(root_item); kfree(root_item);
return ret; return ret;
} }
@ -762,6 +772,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!pending_snapshot) if (!pending_snapshot)
return -ENOMEM; return -ENOMEM;
ret = get_anon_bdev(&pending_snapshot->anon_dev);
if (ret < 0)
goto free_pending;
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
GFP_KERNEL); GFP_KERNEL);
pending_snapshot->path = btrfs_alloc_path(); pending_snapshot->path = btrfs_alloc_path();
@ -823,10 +836,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
d_instantiate(dentry, inode); d_instantiate(dentry, inode);
ret = 0; ret = 0;
pending_snapshot->anon_dev = 0;
fail: fail:
/* Prevent double freeing of anon_dev */
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap); btrfs_put_root(pending_snapshot->snap);
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
free_pending: free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
kfree(pending_snapshot->root_item); kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path); btrfs_free_path(pending_snapshot->path);
kfree(pending_snapshot); kfree(pending_snapshot);

View File

@ -1630,7 +1630,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
} }
key.offset = (u64)-1; key.offset = (u64)-1;
pending->snap = btrfs_get_fs_root(fs_info, objectid, true); pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) { if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap); ret = PTR_ERR(pending->snap);
btrfs_abort_transaction(trans, ret); btrfs_abort_transaction(trans, ret);

View File

@ -151,6 +151,8 @@ struct btrfs_pending_snapshot {
struct btrfs_block_rsv block_rsv; struct btrfs_block_rsv block_rsv;
/* extra metadata reservation for relocation */ /* extra metadata reservation for relocation */
int error; int error;
/* Preallocated anonymous block device number */
dev_t anon_dev;
bool readonly; bool readonly;
struct list_head list; struct list_head list;
}; };