forked from Minki/linux
btrfs: preallocate anon block device at first phase of snapshot creation
[BUG]
When the anonymous block device pool is exhausted, subvolume/snapshot creation fails with EMFILE (Too many open files). This has been reported by a user. The allocation happens in the second phase, during transaction commit, where the only way out is to abort the transaction:

  BTRFS: Transaction aborted (error -24)
  WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs]
  RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs]
  Call Trace:
   create_pending_snapshots+0x82/0xa0 [btrfs]
   btrfs_commit_transaction+0x275/0x8c0 [btrfs]
   btrfs_mksubvol+0x4b9/0x500 [btrfs]
   btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
   btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
   btrfs_ioctl+0x11a4/0x2da0 [btrfs]
   do_vfs_ioctl+0xa9/0x640
   ksys_ioctl+0x67/0x90
   __x64_sys_ioctl+0x1a/0x20
   do_syscall_64+0x5a/0x110
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  ---[ end trace 33f2f83f3d5250e9 ]---
  BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown
  BTRFS info (device sda1): forced readonly
  BTRFS warning (device sda1): Skipping commit of aborted transaction.
  BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown

[CAUSE]
When the global anonymous block device pool is exhausted, the following call chain will fail, and lead to transaction abort:

  btrfs_ioctl_snap_create_v2()
  |- btrfs_ioctl_snap_create_transid()
     |- btrfs_mksubvol()
        |- btrfs_commit_transaction()
           |- create_pending_snapshot()
              |- btrfs_get_fs_root()
                 |- btrfs_init_fs_root()
                    |- get_anon_bdev()

[FIX]
Although we can't enlarge the anonymous block device pool, at least we can preallocate anon_dev for subvolume/snapshot in the first phase, outside of transaction context and exactly at the moment the user calls the creation ioctl.
Reported-by: Greed Rong <greedrong@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
082b6c970f
commit
2dfb1e43f5
@ -1391,7 +1391,12 @@ alloc_fail:
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int btrfs_init_fs_root(struct btrfs_root *root)
|
/*
|
||||||
|
* Initialize subvolume root in-memory structure
|
||||||
|
*
|
||||||
|
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
|
||||||
|
*/
|
||||||
|
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
unsigned int nofs_flag;
|
unsigned int nofs_flag;
|
||||||
@ -1430,9 +1435,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root)
|
|||||||
*/
|
*/
|
||||||
if (is_fstree(root->root_key.objectid) &&
|
if (is_fstree(root->root_key.objectid) &&
|
||||||
btrfs_root_refs(&root->root_item) > 0) {
|
btrfs_root_refs(&root->root_item) > 0) {
|
||||||
ret = get_anon_bdev(&root->anon_dev);
|
if (!anon_dev) {
|
||||||
if (ret)
|
ret = get_anon_bdev(&root->anon_dev);
|
||||||
goto fail;
|
if (ret)
|
||||||
|
goto fail;
|
||||||
|
} else {
|
||||||
|
root->anon_dev = anon_dev;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
mutex_lock(&root->objectid_mutex);
|
mutex_lock(&root->objectid_mutex);
|
||||||
@ -1537,8 +1546,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
|
/*
|
||||||
u64 objectid, bool check_ref)
|
* Get an in-memory reference of a root structure.
|
||||||
|
*
|
||||||
|
* For essential trees like root/extent tree, we grab it from fs_info directly.
|
||||||
|
* For subvolume trees, we check the cached filesystem roots first. If not
|
||||||
|
* found, then read it from disk and add it to cached fs roots.
|
||||||
|
*
|
||||||
|
* Caller should release the root by calling btrfs_put_root() after the usage.
|
||||||
|
*
|
||||||
|
* NOTE: Reloc and log trees can't be read by this function as they share the
|
||||||
|
* same root objectid.
|
||||||
|
*
|
||||||
|
* @objectid: root id
|
||||||
|
* @anon_dev: preallocated anonymous block device number for new roots,
|
||||||
|
* pass 0 for new allocation.
|
||||||
|
* @check_ref: whether to check root item references. If true, return -ENOENT
|
||||||
|
* for orphan roots
|
||||||
|
*/
|
||||||
|
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
|
||||||
|
u64 objectid, dev_t anon_dev,
|
||||||
|
bool check_ref)
|
||||||
{
|
{
|
||||||
struct btrfs_root *root;
|
struct btrfs_root *root;
|
||||||
struct btrfs_path *path;
|
struct btrfs_path *path;
|
||||||
@ -1567,6 +1595,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
|
|||||||
again:
|
again:
|
||||||
root = btrfs_lookup_fs_root(fs_info, objectid);
|
root = btrfs_lookup_fs_root(fs_info, objectid);
|
||||||
if (root) {
|
if (root) {
|
||||||
|
/* Shouldn't get preallocated anon_dev for cached roots */
|
||||||
|
ASSERT(!anon_dev);
|
||||||
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
|
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
|
||||||
btrfs_put_root(root);
|
btrfs_put_root(root);
|
||||||
return ERR_PTR(-ENOENT);
|
return ERR_PTR(-ENOENT);
|
||||||
@ -1586,7 +1616,7 @@ again:
|
|||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = btrfs_init_fs_root(root);
|
ret = btrfs_init_fs_root(root, anon_dev);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
@ -1619,6 +1649,33 @@ fail:
|
|||||||
return ERR_PTR(ret);
|
return ERR_PTR(ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get in-memory reference of a root structure
|
||||||
|
*
|
||||||
|
* @objectid: tree objectid
|
||||||
|
* @check_ref: if set, verify that the tree exists and the item has at least
|
||||||
|
* one reference
|
||||||
|
*/
|
||||||
|
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
|
||||||
|
u64 objectid, bool check_ref)
|
||||||
|
{
|
||||||
|
return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Get in-memory reference of a root structure, created as new, optionally pass
|
||||||
|
* the anonymous block device id
|
||||||
|
*
|
||||||
|
* @objectid: tree objectid
|
||||||
|
* @anon_dev: if zero, allocate a new anonymous block device; otherwise use the
|
||||||
|
* parameter value
|
||||||
|
*/
|
||||||
|
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
|
||||||
|
u64 objectid, dev_t anon_dev)
|
||||||
|
{
|
||||||
|
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
|
||||||
|
}
|
||||||
|
|
||||||
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
|
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
|
||||||
{
|
{
|
||||||
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
|
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
|
||||||
|
@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
|
|||||||
|
|
||||||
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
|
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
|
||||||
u64 objectid, bool check_ref);
|
u64 objectid, bool check_ref);
|
||||||
|
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
|
||||||
|
u64 objectid, dev_t anon_dev);
|
||||||
|
|
||||||
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
|
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
|
||||||
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
|
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
|
||||||
|
@ -566,6 +566,7 @@ static noinline int create_subvol(struct inode *dir,
|
|||||||
struct inode *inode;
|
struct inode *inode;
|
||||||
int ret;
|
int ret;
|
||||||
int err;
|
int err;
|
||||||
|
dev_t anon_dev = 0;
|
||||||
u64 objectid;
|
u64 objectid;
|
||||||
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
|
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
|
||||||
u64 index = 0;
|
u64 index = 0;
|
||||||
@ -578,6 +579,10 @@ static noinline int create_subvol(struct inode *dir,
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto fail_free;
|
goto fail_free;
|
||||||
|
|
||||||
|
ret = get_anon_bdev(&anon_dev);
|
||||||
|
if (ret < 0)
|
||||||
|
goto fail_free;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Don't create subvolume whose level is not zero. Or qgroup will be
|
* Don't create subvolume whose level is not zero. Or qgroup will be
|
||||||
* screwed up since it assumes subvolume qgroup's level to be 0.
|
* screwed up since it assumes subvolume qgroup's level to be 0.
|
||||||
@ -660,12 +665,15 @@ static noinline int create_subvol(struct inode *dir,
|
|||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
key.offset = (u64)-1;
|
key.offset = (u64)-1;
|
||||||
new_root = btrfs_get_fs_root(fs_info, objectid, true);
|
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
|
||||||
if (IS_ERR(new_root)) {
|
if (IS_ERR(new_root)) {
|
||||||
|
free_anon_bdev(anon_dev);
|
||||||
ret = PTR_ERR(new_root);
|
ret = PTR_ERR(new_root);
|
||||||
btrfs_abort_transaction(trans, ret);
|
btrfs_abort_transaction(trans, ret);
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
|
/* Freeing will be done in btrfs_put_root() of new_root */
|
||||||
|
anon_dev = 0;
|
||||||
|
|
||||||
btrfs_record_root_in_trans(trans, new_root);
|
btrfs_record_root_in_trans(trans, new_root);
|
||||||
|
|
||||||
@ -735,6 +743,8 @@ fail:
|
|||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
fail_free:
|
fail_free:
|
||||||
|
if (anon_dev)
|
||||||
|
free_anon_bdev(anon_dev);
|
||||||
kfree(root_item);
|
kfree(root_item);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -762,6 +772,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
|
|||||||
if (!pending_snapshot)
|
if (!pending_snapshot)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
|
ret = get_anon_bdev(&pending_snapshot->anon_dev);
|
||||||
|
if (ret < 0)
|
||||||
|
goto free_pending;
|
||||||
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
|
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
|
||||||
GFP_KERNEL);
|
GFP_KERNEL);
|
||||||
pending_snapshot->path = btrfs_alloc_path();
|
pending_snapshot->path = btrfs_alloc_path();
|
||||||
@ -823,10 +836,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
|
|||||||
|
|
||||||
d_instantiate(dentry, inode);
|
d_instantiate(dentry, inode);
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
pending_snapshot->anon_dev = 0;
|
||||||
fail:
|
fail:
|
||||||
|
/* Prevent double freeing of anon_dev */
|
||||||
|
if (ret && pending_snapshot->snap)
|
||||||
|
pending_snapshot->snap->anon_dev = 0;
|
||||||
btrfs_put_root(pending_snapshot->snap);
|
btrfs_put_root(pending_snapshot->snap);
|
||||||
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
|
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
|
||||||
free_pending:
|
free_pending:
|
||||||
|
if (pending_snapshot->anon_dev)
|
||||||
|
free_anon_bdev(pending_snapshot->anon_dev);
|
||||||
kfree(pending_snapshot->root_item);
|
kfree(pending_snapshot->root_item);
|
||||||
btrfs_free_path(pending_snapshot->path);
|
btrfs_free_path(pending_snapshot->path);
|
||||||
kfree(pending_snapshot);
|
kfree(pending_snapshot);
|
||||||
|
@ -1630,7 +1630,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
|
|||||||
}
|
}
|
||||||
|
|
||||||
key.offset = (u64)-1;
|
key.offset = (u64)-1;
|
||||||
pending->snap = btrfs_get_fs_root(fs_info, objectid, true);
|
pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
|
||||||
if (IS_ERR(pending->snap)) {
|
if (IS_ERR(pending->snap)) {
|
||||||
ret = PTR_ERR(pending->snap);
|
ret = PTR_ERR(pending->snap);
|
||||||
btrfs_abort_transaction(trans, ret);
|
btrfs_abort_transaction(trans, ret);
|
||||||
|
@ -151,6 +151,8 @@ struct btrfs_pending_snapshot {
|
|||||||
struct btrfs_block_rsv block_rsv;
|
struct btrfs_block_rsv block_rsv;
|
||||||
/* extra metadata reservation for relocation */
|
/* extra metadata reservation for relocation */
|
||||||
int error;
|
int error;
|
||||||
|
/* Preallocated anonymous block device number */
|
||||||
|
dev_t anon_dev;
|
||||||
bool readonly;
|
bool readonly;
|
||||||
struct list_head list;
|
struct list_head list;
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user