for-5.3-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAl0sNWYACgkQxWXV+ddt
 WDsyQA/8CGnF68g6hwVuYz4K7f39gOiFlBnRxeN/3RT6vkNSyLZxvRDaDrSTzVIo
 cz2G/9qZLXsIll+3EfZlyzZZiA+4f4hEDAfAd4yVPavRom+uu7dbqzAIpgvFlYdH
 vhAYKOeWSqWElWJ06hzWO3FCwjY9GKFMk4PS0XHHp+STCT0hq1MkaHr44kiHsqdh
 T5nVGDwXz8nGDZ51RO6+mgiSrd5eHbs6kXCd8rW7hmjTx8ClKHa1tdkxN/us+pJm
 hTFT669m5ckHhY2AUKmkREoOwpnt2HcXQJNkz6gO+o03IDvYz73SScbhSYdNTlwi
 j74GLf89FA52qVM+JDg9MaWYqgf1pQI8AHK/rXw2FNbuP/eL9kuZ85ZIbO6CiO0c
 5jAixReSwzSP/V0+MKW3F7k4KtIqbHAV6mkI8zLwrAee4Xj81BOtgL7gYPFQTwSZ
 ma0hEoen7IV5+/z9upUuLA5wr4BT+h1T+EllCWe1+9+9mRYOvowtkRNBL8HZWTDI
 b65oTITfot54xX9ecKtiuG2qoqJEjjkR+YKdRM4nph6wflSNZxEoezBp3iRFpYOL
 Lx+g97RcJ2EEoBVjVMkTqfj93GeiKRifa8yXdRY+A0I2ZXZEcS8DjSJM6rj3AOPy
 4idIl+ABscayZowfqu0FSIULf1La0qiRXmbGNeG4ylhN4L6S/og=
 =eshk
 -----END PGP SIGNATURE-----

Merge tag 'for-5.3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "Highlights:

   - chunks that have been trimmed and unchanged since last mount are
     tracked and skipped on repeated trims

   - use hw assisted crc32c on more arches, speedups if native
     instructions or optimized implementation is available

   - the RAID56 incompat bit is automatically removed when the last
     block group of that type is removed

  Fixes:

   - fsync fix for reflink on NODATACOW files that could lead to ENOSPC

   - fix data loss after evicting an inode, renaming it, and fsyncing it

   - fix fsync not persisting dentry deletions due to inode evictions

   - update ctime/mtime/iversion after hole punching

   - fix compression type validation (reported by KASAN)

   - send won't be allowed to start when relocation is in progress, this
     can cause spurious errors or produce incorrect send stream

  Core:

   - new tracepoints for space update

   - tree-checker: better check for end of extents for some tree items

   - preparatory work for more checksum algorithms

   - run delayed iput at unlink time and don't push the work to cleaner
     thread where it's not properly throttled

   - wrap block mapping to structures and helpers, base for further
     refactoring

   - split large files, part 1:
       - space info handling
       - block group reservations
       - delayed refs
       - delayed allocation

   - other cleanups and refactoring"

* tag 'for-5.3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (103 commits)
  btrfs: fix memory leak of path on error return path
  btrfs: move the subvolume reservation stuff out of extent-tree.c
  btrfs: migrate the delalloc space stuff to it's own home
  btrfs: migrate btrfs_trans_release_chunk_metadata
  btrfs: migrate the delayed refs rsv code
  btrfs: Evaluate io_tree in find_lock_delalloc_range()
  btrfs: migrate the global_block_rsv helpers to block-rsv.c
  btrfs: migrate the block-rsv code to block-rsv.c
  btrfs: stop using block_rsv_release_bytes everywhere
  btrfs: cleanup the target logic in __btrfs_block_rsv_release
  btrfs: export __btrfs_block_rsv_release
  btrfs: export btrfs_block_rsv_add_bytes
  btrfs: move btrfs_block_rsv definitions into it's own header
  btrfs: Simplify update of space_info in __reserve_metadata_bytes()
  btrfs: unexport can_overcommit
  btrfs: move reserve_metadata_bytes and supporting code to space-info.c
  btrfs: move dump_space_info to space-info.c
  btrfs: export block_rsv_use_bytes
  btrfs: move btrfs_space_info_add_*_bytes to space-info.c
  btrfs: move the space info update macro to space-info.h
  ...
This commit is contained in:
Linus Torvalds 2019-07-16 15:12:56 -07:00
commit a18f877541
52 changed files with 3831 additions and 3127 deletions

View File

@ -2,7 +2,8 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
select LIBCRC32C
select CRYPTO
select CRYPTO_CRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS

View File

@ -10,7 +10,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o

View File

@ -1465,12 +1465,11 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct ulist *roots, struct ulist *tmp)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
struct ulist *tmp = NULL;
struct ulist *roots = NULL;
struct ulist_iterator uiter;
struct ulist_node *node;
struct seq_list elem = SEQ_LIST_INIT(elem);
@ -1481,12 +1480,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
.share_count = 0,
};
tmp = ulist_alloc(GFP_NOFS);
roots = ulist_alloc(GFP_NOFS);
if (!tmp || !roots) {
ret = -ENOMEM;
goto out;
}
ulist_init(roots);
ulist_init(tmp);
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
@ -1527,8 +1522,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
up_read(&fs_info->commit_root_sem);
}
out:
ulist_free(tmp);
ulist_free(roots);
ulist_release(roots);
ulist_release(tmp);
return ret;
}

View File

@ -57,7 +57,8 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct ulist *roots, struct ulist *tmp_ulist);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);

425
fs/btrfs/block-rsv.c Normal file
View File

@ -0,0 +1,425 @@
// SPDX-License-Identifier: GPL-2.0
#include "ctree.h"
#include "block-rsv.h"
#include "space-info.h"
#include "math.h"
#include "transaction.h"
/*
 * Shrink @block_rsv and give back whatever it holds beyond its new size.
 *
 * @fs_info:               passed through to btrfs_space_info_add_old_bytes()
 * @block_rsv:             reserve to shrink
 * @dest:                  optional reserve that gets first pick of the freed
 *                         bytes; may be NULL
 * @num_bytes:             amount to subtract from @block_rsv->size, or
 *                         (u64)-1 to subtract the whole size
 * @qgroup_to_release_ret: if non-NULL, receives the qgroup excess computed
 *                         below
 *
 * Return: the number of reserved bytes taken out of @block_rsv, counted
 * before they are split between @dest and the space info.
 */
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes,
u64 *qgroup_to_release_ret)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
u64 qgroup_to_release = 0;
u64 ret;
spin_lock(&block_rsv->lock);
/* (u64)-1 is the "release everything" sentinel. */
if (num_bytes == (u64)-1) {
num_bytes = block_rsv->size;
/*
 * NOTE(review): this value is unconditionally overwritten by the
 * qgroup if/else further down, so the store has no effect here;
 * confirm whether qgroup_rsv_size was meant to be reduced too.
 */
qgroup_to_release = block_rsv->qgroup_rsv_size;
}
block_rsv->size -= num_bytes;
/*
 * From here on num_bytes is reused to mean "bytes actually freed": the
 * part of ->reserved that now exceeds the reduced ->size.
 */
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
} else {
num_bytes = 0;
}
/* Same trimming for the qgroup counters: cap reserved at size. */
if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
qgroup_to_release = block_rsv->qgroup_rsv_reserved -
block_rsv->qgroup_rsv_size;
block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
} else {
qgroup_to_release = 0;
}
spin_unlock(&block_rsv->lock);
/* Remember the total before it is handed out below. */
ret = num_bytes;
if (num_bytes > 0) {
if (dest) {
/* @dest's lock is taken only after @block_rsv's was dropped. */
spin_lock(&dest->lock);
if (!dest->full) {
u64 bytes_to_add;
/* Give @dest at most what it is missing to reach its size. */
bytes_to_add = dest->size - dest->reserved;
bytes_to_add = min(num_bytes, bytes_to_add);
dest->reserved += bytes_to_add;
if (dest->reserved >= dest->size)
dest->full = 1;
num_bytes -= bytes_to_add;
}
spin_unlock(&dest->lock);
}
/* Whatever @dest did not absorb goes back to the space info. */
if (num_bytes)
btrfs_space_info_add_old_bytes(fs_info, space_info,
num_bytes);
}
if (qgroup_to_release_ret)
*qgroup_to_release_ret = qgroup_to_release;
return ret;
}
/*
 * Move @num_bytes of reservation from @src to @dst.
 *
 * The bytes are first taken out of @src with btrfs_block_rsv_use_bytes(); if
 * that fails, its error is returned and @dst is left untouched.  Otherwise
 * the bytes are credited to @dst, forwarding @update_size to
 * btrfs_block_rsv_add_bytes().
 *
 * Return: 0 on success, otherwise the error from btrfs_block_rsv_use_bytes().
 */
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
			    struct btrfs_block_rsv *dst, u64 num_bytes,
			    bool update_size)
{
	int err = btrfs_block_rsv_use_bytes(src, num_bytes);

	if (!err)
		btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);

	return err;
}
/*
 * Reset @rsv to an all-zero state, then set up its spinlock and record
 * @type in it.
 */
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
	memset(rsv, 0, sizeof(struct btrfs_block_rsv));

	/* Both steps below only touch fields the memset just cleared. */
	rsv->type = type;
	spin_lock_init(&rsv->lock);
}
/*
 * Initialize @rsv like btrfs_init_block_rsv() and additionally attach it to
 * the space info that btrfs_find_space_info() returns for
 * BTRFS_BLOCK_GROUP_METADATA.
 */
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
				   struct btrfs_block_rsv *rsv,
				   unsigned short type)
{
	struct btrfs_space_info *meta_sinfo;

	btrfs_init_block_rsv(rsv, type);

	meta_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	rsv->space_info = meta_sinfo;
}
/*
 * Allocate a block reserve with GFP_NOFS and initialize it through
 * btrfs_init_metadata_block_rsv().
 *
 * Return: the new reserve, or NULL if the allocation failed.
 */
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
					      unsigned short type)
{
	struct btrfs_block_rsv *rsv;

	rsv = kmalloc(sizeof(struct btrfs_block_rsv), GFP_NOFS);
	if (rsv)
		btrfs_init_metadata_block_rsv(fs_info, rsv, type);

	return rsv;
}
/*
 * Release everything @rsv still holds (the (u64)-1 "all bytes" request) and
 * free it.  A NULL @rsv is accepted and ignored.
 */
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
			  struct btrfs_block_rsv *rsv)
{
	if (rsv) {
		btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
		kfree(rsv);
	}
}
/*
 * Reserve @num_bytes of metadata space and, on success, add them to
 * @block_rsv, growing its size by the same amount.
 *
 * A request for zero bytes succeeds immediately without reserving anything.
 *
 * Return: 0 on success, otherwise the error from
 * btrfs_reserve_metadata_bytes().
 */
int btrfs_block_rsv_add(struct btrfs_root *root,
			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
			enum btrfs_reserve_flush_enum flush)
{
	int err = 0;

	if (num_bytes != 0) {
		err = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes,
						   flush);
		if (err == 0)
			btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
	}

	return err;
}
/*
 * Check whether @block_rsv currently holds at least
 * div_factor(size, @min_factor) bytes.
 *
 * Return: 0 if it does (or if @block_rsv is NULL), -ENOSPC otherwise.
 */
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
{
	u64 threshold;
	bool enough;

	if (!block_rsv)
		return 0;

	/* Sample size and reserved together under the reserve's lock. */
	spin_lock(&block_rsv->lock);
	threshold = div_factor(block_rsv->size, min_factor);
	enough = block_rsv->reserved >= threshold;
	spin_unlock(&block_rsv->lock);

	return enough ? 0 : -ENOSPC;
}
/*
 * Make sure @block_rsv holds at least @min_reserved bytes.
 *
 * If it already does (or @block_rsv is NULL) nothing happens.  Otherwise the
 * missing amount is reserved with btrfs_reserve_metadata_bytes() and added
 * to the reserve without changing its size.
 *
 * Return: 0 on success, otherwise the error from
 * btrfs_reserve_metadata_bytes().
 */
int btrfs_block_rsv_refill(struct btrfs_root *root,
			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
			   enum btrfs_reserve_flush_enum flush)
{
	u64 shortfall = 0;
	int err;

	if (!block_rsv)
		return 0;

	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < min_reserved)
		shortfall = min_reserved - block_rsv->reserved;
	spin_unlock(&block_rsv->lock);

	/* Zero shortfall means the reserve was already at or above the mark. */
	if (shortfall == 0)
		return 0;

	err = btrfs_reserve_metadata_bytes(root, block_rsv, shortfall, flush);
	if (err)
		return err;

	btrfs_block_rsv_add_bytes(block_rsv, shortfall, false);
	return 0;
}
/*
 * Release @num_bytes from @block_rsv, choosing which other reserve (if any)
 * gets first pick of the freed bytes:
 *
 *  - the delayed refs reserve hands its bytes to the global reserve;
 *  - any reserve other than the global one hands them to the delayed refs
 *    reserve, as long as that one is not marked full;
 *  - everything else has no target.
 *
 * A target is dropped again when it does not share @block_rsv's space info.
 *
 * Return: the value returned by block_rsv_release_bytes().
 */
u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
			      struct btrfs_block_rsv *block_rsv,
			      u64 num_bytes, u64 *qgroup_to_release)
{
	struct btrfs_block_rsv *global = &fs_info->global_block_rsv;
	struct btrfs_block_rsv *delayed_refs = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *sink;

	if (block_rsv == delayed_refs)
		sink = global;
	else if (block_rsv == global || delayed_refs->full)
		sink = NULL;
	else
		sink = delayed_refs;

	if (sink && sink->space_info != block_rsv->space_info)
		sink = NULL;

	return block_rsv_release_bytes(fs_info, block_rsv, sink, num_bytes,
				       qgroup_to_release);
}
/*
 * Consume @num_bytes from @block_rsv's reserved bytes.
 *
 * Nothing is changed when fewer than @num_bytes are reserved.  After a
 * successful deduction the full flag is cleared if reserved dropped below
 * size.
 *
 * Return: 0 on success, -ENOSPC if not enough bytes were reserved.
 */
int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
{
	bool covered;

	spin_lock(&block_rsv->lock);
	covered = block_rsv->reserved >= num_bytes;
	if (covered) {
		block_rsv->reserved -= num_bytes;
		if (block_rsv->size > block_rsv->reserved)
			block_rsv->full = 0;
	}
	spin_unlock(&block_rsv->lock);

	return covered ? 0 : -ENOSPC;
}
/*
 * Credit @num_bytes to @block_rsv's reserved bytes.
 *
 * With @update_size the reserve's size grows by the same amount.  Without
 * it, the reserve is marked full once reserved has reached its size.
 */
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes, bool update_size)
{
	spin_lock(&block_rsv->lock);

	block_rsv->reserved += num_bytes;
	if (!update_size) {
		if (block_rsv->reserved >= block_rsv->size)
			block_rsv->full = 1;
	} else {
		block_rsv->size += num_bytes;
	}

	spin_unlock(&block_rsv->lock);
}
/*
 * Move @num_bytes from the global block reserve into @dest, but only if the
 * global reserve would still hold at least div_factor(size, @min_factor)
 * bytes afterwards.
 *
 * The migration is refused when @dest does not use the same space info as
 * the global reserve.  On success @dest's size grows by @num_bytes as well.
 *
 * Return: 0 on success, -ENOSPC if the migration was refused.
 */
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_block_rsv *dest, u64 num_bytes,
			     int min_factor)
{
	struct btrfs_block_rsv *global = &fs_info->global_block_rsv;
	u64 keep_bytes;
	int err = -ENOSPC;

	if (global->space_info != dest->space_info)
		return -ENOSPC;

	spin_lock(&global->lock);
	keep_bytes = div_factor(global->size, min_factor);
	if (global->reserved >= keep_bytes + num_bytes) {
		global->reserved -= num_bytes;
		if (global->reserved < global->size)
			global->full = 0;
		err = 0;
	}
	spin_unlock(&global->lock);

	if (err)
		return err;

	/* @dest is only touched once the global reserve's lock is dropped. */
	btrfs_block_rsv_add_bytes(dest, num_bytes, true);
	return 0;
}
/*
 * Recompute the size of the global block reserve and bring its reserved
 * bytes in line with that size, adjusting the space info's bytes_may_use
 * accounting for every byte added or dropped.
 */
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
u64 num_bytes;
/*
 * The global block rsv is based on the size of the extent tree, the
 * checksum tree and the root tree. If the fs is empty we want to set
 * it to a minimal amount for safety.
 */
num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
btrfs_root_used(&fs_info->csum_root->root_item) +
btrfs_root_used(&fs_info->tree_root->root_item);
/* Lower bound of 16M here; the 512M upper bound is applied below. */
num_bytes = max_t(u64, num_bytes, SZ_16M);
/* Lock order: the space info's lock first, then the reserve's lock. */
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
block_rsv->size = min_t(u64, num_bytes, SZ_512M);
if (block_rsv->reserved < block_rsv->size) {
/*
 * Under-reserved: top up from what the space info has left, i.e.
 * total_bytes minus whatever btrfs_space_info_used() reports, but
 * never by more than the reserve is missing.
 */
num_bytes = btrfs_space_info_used(sinfo, true);
if (sinfo->total_bytes > num_bytes) {
num_bytes = sinfo->total_bytes - num_bytes;
num_bytes = min(num_bytes,
block_rsv->size - block_rsv->reserved);
block_rsv->reserved += num_bytes;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
sinfo->flags, num_bytes,
1);
}
} else if (block_rsv->reserved > block_rsv->size) {
/* Over-reserved: hand the excess back to the space info. */
num_bytes = block_rsv->reserved - block_rsv->size;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
-num_bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
sinfo->flags, num_bytes, 0);
block_rsv->reserved = block_rsv->size;
}
/* The reserve counts as full only when reserved matches size exactly. */
if (block_rsv->reserved == block_rsv->size)
block_rsv->full = 1;
else
block_rsv->full = 0;
spin_unlock(&block_rsv->lock);
spin_unlock(&sinfo->lock);
}
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *space_info;
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
fs_info->chunk_block_rsv.space_info = space_info;
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
fs_info->empty_block_rsv.space_info = space_info;
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
if (fs_info->quota_root)
fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
btrfs_update_global_block_rsv(fs_info);
}
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1);
WARN_ON(fs_info->trans_block_rsv.size > 0);
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
WARN_ON(fs_info->delayed_block_rsv.size > 0);
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
WARN_ON(fs_info->delayed_refs_rsv.size > 0);
}
/*
 * Pick the block reserve to charge for an allocation in @root.
 *
 * The transaction's reserve is preferred when @root has
 * BTRFS_ROOT_REF_COWS set, when it is the csum root and the transaction is
 * adding csums, or when it is the uuid root.  If that yields nothing the
 * root's own reserve is used, and as a last resort the filesystem's empty
 * reserve.  The result is never NULL.
 */
static struct btrfs_block_rsv *get_block_rsv(
					const struct btrfs_trans_handle *trans,
					const struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *rsv;
	bool prefer_trans;

	prefer_trans = test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
		       (root == fs_info->csum_root && trans->adding_csums) ||
		       root == fs_info->uuid_root;

	rsv = prefer_trans ? trans->block_rsv : NULL;
	if (rsv)
		return rsv;

	rsv = root->block_rsv;
	return rsv ? rsv : &fs_info->empty_block_rsv;
}
/*
 * Charge @blocksize bytes for a new tree block to a suitable block reserve.
 *
 * The reserve chosen by get_block_rsv() is tried first.  If it cannot cover
 * the request, the function falls back, in order, to: refreshing the global
 * reserve once (only when the chosen reserve is the global one), a
 * non-flushing metadata reservation, and finally the global reserve itself
 * when it shares the chosen reserve's space info.
 *
 * Return: the reserve that was actually charged, or an ERR_PTR() on failure.
 */
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
bool global_updated = false;
block_rsv = get_block_rsv(trans, root);
/* A zero-sized reserve has nothing to take from; go reserve directly. */
if (unlikely(block_rsv->size == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
if (!ret)
return block_rsv;
/* failfast reserves report the shortage instead of trying fallbacks. */
if (block_rsv->failfast)
return ERR_PTR(ret);
/* Give the global reserve one chance to be refreshed, then retry. */
if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
global_updated = true;
btrfs_update_global_block_rsv(fs_info);
goto again;
}
/*
 * The global reserve still exists to save us from ourselves, so don't
 * warn_on if we are short on our delayed refs reserve.
 */
if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
/* Function-static state: the rate limit is shared by all callers. */
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL * 10,
/*DEFAULT_RATELIMIT_BURST*/ 1);
if (__ratelimit(&_rs))
WARN(1, KERN_DEBUG
"BTRFS: block rsv returned %d\n", ret);
}
try_reserve:
ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
BTRFS_RESERVE_NO_FLUSH);
if (!ret)
return block_rsv;
/*
 * If we couldn't reserve metadata bytes try and use some from
 * the global reserve if its space type is the same as the global
 * reservation.
 */
if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
block_rsv->space_info == global_rsv->space_info) {
ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
if (!ret)
return global_rsv;
}
return ERR_PTR(ret);
}

101
fs/btrfs/block-rsv.h Normal file
View File

@ -0,0 +1,101 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_BLOCK_RSV_H
#define BTRFS_BLOCK_RSV_H
struct btrfs_trans_handle;
enum btrfs_reserve_flush_enum;
/*
* Types of block reserves
*/
/*
 * Identifies what a block reserve is used for.  One of these values is
 * stored in btrfs_block_rsv::type when the reserve is initialized.
 */
enum {
BTRFS_BLOCK_RSV_GLOBAL,
BTRFS_BLOCK_RSV_DELALLOC,
BTRFS_BLOCK_RSV_TRANS,
BTRFS_BLOCK_RSV_CHUNK,
BTRFS_BLOCK_RSV_DELOPS,
BTRFS_BLOCK_RSV_DELREFS,
BTRFS_BLOCK_RSV_EMPTY,
BTRFS_BLOCK_RSV_TEMP,
};
/*
 * A block reserve: a byte budget (@size) together with the number of bytes
 * currently held against it (@reserved), tied to one space info.
 */
struct btrfs_block_rsv {
/* Number of bytes this reserve is meant to hold. */
u64 size;
/* Number of bytes currently held by this reserve. */
u64 reserved;
/* Space info this reserve draws from and returns bytes to. */
struct btrfs_space_info *space_info;
/* Taken around updates of the counters and the full flag. */
spinlock_t lock;
/* Non-zero once reserved has reached size. */
unsigned short full;
/* One of the BTRFS_BLOCK_RSV_* values. */
unsigned short type;
/* When set, btrfs_use_block_rsv() fails right away on a shortage. */
unsigned short failfast;
/*
 * Qgroup equivalent for @size @reserved
 *
 * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
 * about things like csum size nor how many tree blocks it will need to
 * reserve.
 *
 * Qgroup cares more about net change of the extent usage.
 *
 * So for one newly inserted file extent, in worst case it will cause
 * leaf split and level increase, nodesize for each file extent is
 * already too much.
 *
 * In short, qgroup_size/reserved is the upper limit of possible needed
 * qgroup metadata reservation.
 */
u64 qgroup_rsv_size;
u64 qgroup_rsv_reserved;
};
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
unsigned short type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
bool update_size);
int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size);
u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes, u64 *qgroup_to_release);
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info);
void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info);
void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info);
struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u32 blocksize);
/*
 * Release @num_bytes from @block_rsv for callers that need neither the
 * released byte count nor the qgroup amount.
 */
static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
					   struct btrfs_block_rsv *block_rsv,
					   u64 num_bytes)
{
	/* The released byte count is deliberately discarded. */
	(void)__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
}
/*
 * Give @blocksize bytes back to @block_rsv.
 *
 * The bytes are re-credited without changing the reserve's size, then a
 * zero-byte release is run so that anything now held beyond the size is
 * passed on.
 */
static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
					 struct btrfs_block_rsv *block_rsv,
					 u32 blocksize)
{
	btrfs_block_rsv_add_bytes(block_rsv, blocksize, false);
	/* Same call btrfs_block_rsv_release() would make for 0 bytes. */
	__btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
#endif /* BTRFS_BLOCK_RSV_H */

View File

@ -337,22 +337,34 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
}
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
u64 logical_start, u32 csum, u32 csum_expected, int mirror_num)
u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
struct btrfs_root *root = inode->root;
struct btrfs_super_block *sb = root->fs_info->super_copy;
const u16 csum_size = btrfs_super_csum_size(sb);
/* Output minus objectid, which is more meaningful */
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
else
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
}
#endif

View File

@ -83,7 +83,7 @@
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/crc32c.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@ -1710,9 +1710,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
char **datav, unsigned int num_pages)
{
struct btrfs_fs_info *fs_info = state->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_header *h;
u8 csum[BTRFS_CSUM_SIZE];
u32 crc = ~(u32)0;
unsigned int i;
if (num_pages * PAGE_SIZE < state->metablock_size)
@ -1723,14 +1723,17 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE))
return 1;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
for (i = 0; i < num_pages; i++) {
u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
size_t sublen = i ? PAGE_SIZE :
(PAGE_SIZE - BTRFS_CSUM_SIZE);
crc = crc32c(crc, data, sublen);
crypto_shash_update(shash, data, sublen);
}
btrfs_csum_final(crc, csum);
crypto_shash_final(shash, csum);
if (memcmp(csum, h->csum, state->csum_size))
return 1;

View File

@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@ -42,6 +43,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type)
return NULL;
}
/*
 * Check whether @str (of which at most @len bytes are considered) starts
 * with one of the names in btrfs_compress_types[].
 *
 * Entry 0 of the table is skipped.  A name only matches when @len is at
 * least as long as that name, so the comparison never looks past @len.
 *
 * Return: true if a name matches as a prefix of @str, false otherwise.
 */
bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
	int idx = 1;

	while (idx < ARRAY_SIZE(btrfs_compress_types)) {
		const char *name = btrfs_compress_types[idx];
		size_t name_len = strlen(name);

		if (len >= name_len && strncmp(name, str, name_len) == 0)
			return true;
		idx++;
	}

	return false;
}
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
@ -57,32 +74,37 @@ static int check_compressed_csum(struct btrfs_inode *inode,
struct compressed_bio *cb,
u64 disk_start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int ret;
struct page *page;
unsigned long i;
char *kaddr;
u32 csum;
u32 *cb_sum = &cb->sums;
u8 csum[BTRFS_CSUM_SIZE];
u8 *cb_sum = cb->sums;
if (inode->flags & BTRFS_INODE_NODATASUM)
return 0;
shash->tfm = fs_info->csum_shash;
for (i = 0; i < cb->nr_pages; i++) {
page = cb->compressed_pages[i];
csum = ~(u32)0;
crypto_shash_init(shash);
kaddr = kmap_atomic(page);
csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE);
btrfs_csum_final(csum, (u8 *)&csum);
crypto_shash_update(shash, kaddr, PAGE_SIZE);
kunmap_atomic(kaddr);
crypto_shash_final(shash, (u8 *)&csum);
if (csum != *cb_sum) {
btrfs_print_data_csum_error(inode, disk_start, csum,
*cb_sum, cb->mirror_num);
if (memcmp(&csum, cb_sum, csum_size)) {
btrfs_print_data_csum_error(inode, disk_start,
csum, cb_sum, cb->mirror_num);
ret = -EIO;
goto fail;
}
cb_sum++;
cb_sum += csum_size;
}
ret = 0;
@ -318,7 +340,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bdev = fs_info->fs_devices->latest_bdev;
bio = btrfs_bio_alloc(bdev, first_byte);
bio = btrfs_bio_alloc(first_byte);
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
@ -360,7 +383,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_endio(bio);
}
bio = btrfs_bio_alloc(bdev, first_byte);
bio = btrfs_bio_alloc(first_byte);
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
@ -536,7 +560,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct extent_map *em;
blk_status_t ret = BLK_STS_RESOURCE;
int faili = 0;
u32 *sums;
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@ -558,7 +583,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->errors = 0;
cb->inode = inode;
cb->mirror_num = mirror_num;
sums = &cb->sums;
sums = cb->sums;
cb->start = em->orig_start;
em_len = em->len;
@ -597,7 +622,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
/* include any pages we added in add_ra-bio_pages */
cb->len = bio->bi_iter.bi_size;
comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
comp_bio = btrfs_bio_alloc(cur_disk_byte);
bio_set_dev(comp_bio, bdev);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
@ -617,6 +643,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->mapping = NULL;
if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
PAGE_SIZE) {
unsigned int nr_sectors;
ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
BTRFS_WQ_ENDIO_DATA);
BUG_ON(ret); /* -ENOMEM */
@ -634,8 +662,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
sums);
BUG_ON(ret); /* -ENOMEM */
}
sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
fs_info->sectorsize);
sums += csum_size * nr_sectors;
ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
if (ret) {
@ -643,7 +673,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bio_endio(comp_bio);
}
comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte);
comp_bio = btrfs_bio_alloc(cur_disk_byte);
bio_set_dev(comp_bio, bdev);
comp_bio->bi_opf = REQ_OP_READ;
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;

View File

@ -61,7 +61,7 @@ struct compressed_bio {
* the start of a variable length array of checksums only
* used by reads
*/
u32 sums;
u8 sums[];
};
static inline unsigned int btrfs_compress_type(unsigned int type_level)
@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
const char* btrfs_compress_type2str(enum btrfs_compression_type type);
bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);

View File

@ -19,6 +19,7 @@
#include <linux/kobject.h>
#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
#include <asm/unaligned.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
#include <linux/btrfs_tree.h>
@ -31,11 +32,13 @@
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
struct btrfs_pending_snapshot;
struct btrfs_delayed_ref_root;
struct btrfs_space_info;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
@ -45,7 +48,16 @@ struct btrfs_ref;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
#define BTRFS_MAX_MIRRORS 3
/*
* Maximum number of mirrors that can be available for all profiles counting
* the target device of dev-replace as one. During an active device replace
* procedure, the target device of the copy operation is a mirror for the
* filesystem data as well that can be used to read data in order to repair
* read errors on other disks.
*
* Current value is derived from RAID1 with 2 copies.
*/
#define BTRFS_MAX_MIRRORS (2 + 1)
#define BTRFS_MAX_LEVEL 8
@ -72,6 +84,7 @@ struct btrfs_ref;
/* four bytes for CRC32 */
static const int btrfs_csum_sizes[] = { 4 };
static const char *btrfs_csum_names[] = { "crc32c" };
#define BTRFS_EMPTY_DIR_SIZE 0
@ -99,10 +112,6 @@ static inline u32 count_max_extents(u64 size)
return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
}
struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
};
static inline unsigned long btrfs_chunk_item_size(int num_stripes)
{
BUG_ON(num_stripes == 0);
@ -395,115 +404,6 @@ struct raid_kobject {
struct list_head list;
};
/*
 * Space accounting and allocation state for one kind of space; @flags carries
 * the BTRFS_BLOCK_GROUP_* type bits that identify it.
 */
struct btrfs_space_info {
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
this doesn't take mirrors into account */
u64 bytes_used; /* total bytes used,
this doesn't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
unsigned int flush:1; /* set if we are trying to make space */
unsigned int force_alloc; /* set if we need to force a chunk
alloc for this space; holds a CHUNK_ALLOC_* value */
u64 disk_used; /* total bytes used on disk */
u64 disk_total; /* total bytes on disk, takes mirrors into
account */
u64 flags; /* BTRFS_BLOCK_GROUP_* type bits of this space */
/*
 * bytes_pinned is kept in line with what is actually pinned, as in
 * we've called update_block_group and dropped the bytes_used counter
 * and increased the bytes_pinned counter. However this means that
 * bytes_pinned does not reflect the bytes that will be pinned once the
 * delayed refs are flushed, so this counter is inc'ed every time we
 * call btrfs_free_extent so it is a realtime count of what will be
 * freed once the transaction is committed. It will be zeroed every
 * time the transaction commits.
 */
struct percpu_counter total_bytes_pinned;
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
/*
 * tickets_id only indicates which ticket will be handled next; note
 * that it is not stored per ticket.
 */
u64 tickets_id;
struct rw_semaphore groups_sem;
/*
 * Lists of the block groups that belong to this space.
 * NOTE(review): presumably one list per RAID type, indexed by RAID
 * index (the array has BTRFS_NR_RAID_TYPES entries) -- confirm.
 */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
wait_queue_head_t wait;
struct kobject kobj;
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
/*
 * Types of block reserves
 *
 * NOTE(review): presumably the value kept in btrfs_block_rsv::type and
 * passed as the 'type' argument of btrfs_init_block_rsv() and
 * btrfs_alloc_block_rsv() -- confirm against the callers.
 */
enum {
BTRFS_BLOCK_RSV_GLOBAL,
BTRFS_BLOCK_RSV_DELALLOC,
BTRFS_BLOCK_RSV_TRANS,
BTRFS_BLOCK_RSV_CHUNK,
BTRFS_BLOCK_RSV_DELOPS,
BTRFS_BLOCK_RSV_DELREFS,
BTRFS_BLOCK_RSV_EMPTY,
BTRFS_BLOCK_RSV_TEMP,
};
/*
 * A block reservation.
 *
 * @size is the amount the reserve wants to hold and @reserved is what it
 * currently holds; @full is set once @reserved has reached @size.  @size,
 * @reserved, @full and the qgroup counters are modified under @lock.
 */
struct btrfs_block_rsv {
u64 size;
u64 reserved;
/* space info that bytes handed back from this reserve are returned to */
struct btrfs_space_info *space_info;
spinlock_t lock;
unsigned short full;
/* NOTE(review): presumably one of the BTRFS_BLOCK_RSV_* values -- confirm */
unsigned short type;
unsigned short failfast;
/*
 * Qgroup equivalent for @size @reserved
 *
 * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care
 * about things like csum size nor how many tree blocks it will need to
 * reserve.
 *
 * Qgroup cares more about net change of the extent usage.
 *
 * So for one newly inserted file extent, in worst case it will cause
 * leaf split and level increase, nodesize for each file extent is
 * already too much.
 *
 * In short, qgroup_rsv_size/qgroup_rsv_reserved is the upper limit of
 * possible needed qgroup metadata reservation.
 */
u64 qgroup_rsv_size;
u64 qgroup_rsv_reserved;
};
/*
* free clusters are used to claim free space in relatively large chunks,
* allowing us to do less seeky writes. They are used for all metadata
@ -786,11 +686,18 @@ enum {
/*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
*/
BTRFS_FS_BALANCE_RUNNING,
/* Indicate that the cleaner thread is awake and doing something. */
BTRFS_FS_CLEANER_RUNNING,
/*
* The checksumming has an optimized version and is considered fast,
* so we don't need to offload checksums to workqueues.
*/
BTRFS_FS_CSUM_IMPL_FAST,
};
struct btrfs_fs_info {
@ -824,7 +731,7 @@ struct btrfs_fs_info {
struct extent_io_tree *pinned_extents;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
struct extent_map_tree mapping_tree;
/*
* block reservation for extent, checksum, root tree and
@ -1160,6 +1067,14 @@ struct btrfs_fs_info {
spinlock_t swapfile_pins_lock;
struct rb_root swapfile_pins;
struct crypto_shash *csum_shash;
/*
* Number of send operations in progress.
* Updated while holding fs_info::balance_mutex.
*/
int send_in_progress;
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@ -2451,6 +2366,11 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s)
return btrfs_csum_sizes[t];
}
/*
 * Translate a checksum type into its printable name.
 *
 * No range check is performed here: the csum type is validated at mount
 * time, so it is used directly as an index into btrfs_csum_names[].
 */
static inline const char *btrfs_super_csum_name(u16 csum_type)
{
	const char *name = btrfs_csum_names[csum_type];

	return name;
}
/*
* The leaf data grows from end-to-front in the node.
@ -2642,6 +2562,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
btrfs_item_offset_nr(leaf, slot)))
/*
 * Thin wrapper around crc32c(): fold @length bytes at @address into the
 * running checksum @crc and return the updated value.
 */
static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length)
{
	u32 updated = crc32c(crc, address, length);

	return updated;
}
/*
 * Finish a crc32c computation: store the bitwise inverse of @crc into
 * @result as a little-endian 32-bit value (@result may be unaligned).
 */
static inline void btrfs_crc32c_final(u32 crc, u8 *result)
{
	u32 inverted = ~crc;

	put_unaligned_le32(inverted, result);
}
static inline u64 btrfs_name_hash(const char *name, int len)
{
return crc32c((u32)~1, name, len);
@ -2656,12 +2586,6 @@ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
return (u64) crc32c(parent_objectid, name, len);
}
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_constraint(mapping, ~__GFP_FS);
@ -2698,8 +2622,6 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@ -2814,17 +2736,28 @@ enum btrfs_flush_state {
COMMIT_TRANS = 9,
};
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
/*
 * Control flags for the 'force' argument of btrfs_chunk_alloc() (the same
 * values are also stored in btrfs_space_info::force_alloc).
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated. This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 */
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
CHUNK_ALLOC_LIMITED,
CHUNK_ALLOC_FORCE,
};
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
@ -2834,41 +2767,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
unsigned short type);
void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv,
unsigned short type);
void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
int btrfs_block_rsv_add(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
bool update_size);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *src,
u64 num_bytes);
int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
@ -3186,7 +3084,8 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u8 *dst);
blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
u64 logical_offset);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
@ -3514,8 +3413,7 @@ __cold
static inline void assfail(const char *expr, const char *file, int line)
{
if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
pr_err("assertion failed: %s, file: %s, line: %d\n",
expr, file, line);
pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
BUG();
}
}
@ -3599,10 +3497,11 @@ do { \
/* compatibility and incompatibility defines */
#define btrfs_set_fs_incompat(__fs_info, opt) \
__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
#opt)
static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
u64 flag)
u64 flag, const char* name)
{
struct btrfs_super_block *disk_super;
u64 features;
@ -3615,18 +3514,20 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
if (!(features & flag)) {
features |= flag;
btrfs_set_super_incompat_flags(disk_super, features);
btrfs_info(fs_info, "setting %llu feature flag",
flag);
btrfs_info(fs_info,
"setting incompat feature flag for %s (0x%llx)",
name, flag);
}
spin_unlock(&fs_info->super_lock);
}
}
#define btrfs_clear_fs_incompat(__fs_info, opt) \
__btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
__btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \
#opt)
static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
u64 flag)
u64 flag, const char* name)
{
struct btrfs_super_block *disk_super;
u64 features;
@ -3639,8 +3540,9 @@ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
if (features & flag) {
features &= ~flag;
btrfs_set_super_incompat_flags(disk_super, features);
btrfs_info(fs_info, "clearing %llu feature flag",
flag);
btrfs_info(fs_info,
"clearing incompat feature flag for %s (0x%llx)",
name, flag);
}
spin_unlock(&fs_info->super_lock);
}
@ -3657,10 +3559,11 @@ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
}
#define btrfs_set_fs_compat_ro(__fs_info, opt) \
__btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
__btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
#opt)
static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
u64 flag)
u64 flag, const char *name)
{
struct btrfs_super_block *disk_super;
u64 features;
@ -3673,18 +3576,20 @@ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
if (!(features & flag)) {
features |= flag;
btrfs_set_super_compat_ro_flags(disk_super, features);
btrfs_info(fs_info, "setting %llu ro feature flag",
flag);
btrfs_info(fs_info,
"setting compat-ro feature flag for %s (0x%llx)",
name, flag);
}
spin_unlock(&fs_info->super_lock);
}
}
#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
__btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
__btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \
#opt)
static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
u64 flag)
u64 flag, const char *name)
{
struct btrfs_super_block *disk_super;
u64 features;
@ -3697,8 +3602,9 @@ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
if (features & flag) {
features &= ~flag;
btrfs_set_super_compat_ro_flags(disk_super, features);
btrfs_info(fs_info, "clearing %llu ro feature flag",
flag);
btrfs_info(fs_info,
"clearing compat-ro feature flag for %s (0x%llx)",
name, flag);
}
spin_unlock(&fs_info->super_lock);
}

494
fs/btrfs/delalloc-space.c Normal file
View File

@ -0,0 +1,494 @@
// SPDX-License-Identifier: GPL-2.0
#include "ctree.h"
#include "delalloc-space.h"
#include "block-rsv.h"
#include "btrfs_inode.h"
#include "space-info.h"
#include "transaction.h"
#include "qgroup.h"
/*
 * Reserve @bytes (rounded up to the sectorsize) in the data space info by
 * adding them to bytes_may_use.
 *
 * If the data space does not have room, this first tries to allocate a new
 * data chunk and then falls back to committing the transaction (at most
 * twice, tracked by need_commit) before giving up.
 *
 * Returns 0 on success, -ENOSPC if the space could not be found, or the
 * error returned by btrfs_join_transaction(), btrfs_chunk_alloc(),
 * btrfs_commit_transaction() or btrfs_wait_on_delayed_iputs().
 */
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
u64 used;
int ret = 0;
/* Number of commit attempts left; the first one also flushes delalloc. */
int need_commit = 2;
int have_pinned_space;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
/* Free space inodes never commit here and must already be in a transaction. */
if (btrfs_is_free_space_inode(inode)) {
need_commit = 0;
ASSERT(current->journal_info);
}
again:
/* Make sure we have enough space to handle the data first */
spin_lock(&data_sinfo->lock);
used = btrfs_space_info_used(data_sinfo, true);
if (used + bytes > data_sinfo->total_bytes) {
struct btrfs_trans_handle *trans;
/*
 * If we don't have enough free bytes in this space then we need
 * to alloc a new chunk.
 */
if (!data_sinfo->full) {
u64 alloc_target;
/*
 * NOTE(review): the force level is recorded in the space info
 * while CHUNK_ALLOC_NO_FORCE is passed below; presumably
 * btrfs_chunk_alloc() consults force_alloc -- confirm.
 */
data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
spin_unlock(&data_sinfo->lock);
alloc_target = btrfs_data_alloc_profile(fs_info);
/*
 * It is ugly that we don't call nolock join
 * transaction for the free space inode case here.
 * But it is safe because we only do the data space
 * reservation for the free space cache in the
 * transaction context, the common join transaction
 * just increase the counter of the current transaction
 * handler, doesn't try to acquire the trans_lock of
 * the fs.
 */
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
ret = btrfs_chunk_alloc(trans, alloc_target,
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans);
if (ret < 0) {
if (ret != -ENOSPC)
return ret;
else {
/*
 * Chunk allocation hit ENOSPC: skip the pinned-bytes
 * check and let the commit path decide.  The lock was
 * already dropped above.
 */
have_pinned_space = 1;
goto commit_trans;
}
}
goto again;
}
/*
 * If we don't have enough pinned space to deal with this
 * allocation, and no removed chunk in current transaction,
 * don't bother committing the transaction.
 */
/* A result >= 0 is treated below as "pinned bytes cover the shortfall". */
have_pinned_space = __percpu_counter_compare(
&data_sinfo->total_bytes_pinned,
used + bytes - data_sinfo->total_bytes,
BTRFS_TOTAL_BYTES_PINNED_BATCH);
spin_unlock(&data_sinfo->lock);
/* Commit the current transaction and try again */
commit_trans:
if (need_commit) {
need_commit--;
/* Only the first attempt starts delalloc and waits for ordered extents. */
if (need_commit > 0) {
btrfs_start_delalloc_roots(fs_info, -1);
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
(u64)-1);
}
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
if (have_pinned_space >= 0 ||
test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
&trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
/*
 * The cleaner kthread might still be doing iput
 * operations. Wait for it to finish so that
 * more space is released. We don't need to
 * explicitly run the delayed iputs here because
 * the commit_transaction would have woken up
 * the cleaner.
 */
ret = btrfs_wait_on_delayed_iputs(fs_info);
if (ret)
return ret;
goto again;
} else {
btrfs_end_transaction(trans);
}
}
/* Out of options: data_sinfo->lock is not held on this path. */
trace_btrfs_space_reservation(fs_info,
"space_info:enospc",
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
/* Enough room: account the reservation while still holding the lock. */
btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
trace_btrfs_space_reservation(fs_info, "space_info",
data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
return 0;
}
/*
 * Reserve data space and the matching qgroup data space for the range
 * [@start, @start + @len), after widening it to sectorsize boundaries.
 *
 * If the qgroup reservation fails, the data space reserved here is given
 * back before returning.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_check_data_free_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 range_end = round_up(start + len, fs_info->sectorsize);
	u64 range_start = round_down(start, fs_info->sectorsize);
	u64 range_len = range_end - range_start;
	int ret;

	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), range_len);
	if (ret < 0)
		return ret;

	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
	ret = btrfs_qgroup_reserve_data(inode, reserved, range_start,
					range_len);
	if (ret >= 0)
		return 0;

	btrfs_free_reserved_data_space_noquota(inode, range_start, range_len);
	return ret;
}
/*
 * Give back a data reservation for this inode, normally on an error path.
 *
 * This variant does *NOT* go through the accurate qgroup reserved space API.
 * It is meant for callers that cannot sleep and know the qgroup reserved
 * space is not affected, like clear_bit_hook().
 */
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
					    u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_space_info *sinfo = fs_info->data_sinfo;
	u64 range_len;

	/* Widen the range to sectorsize boundaries; only its length is used. */
	range_len = round_up(start + len, fs_info->sectorsize) -
		    round_down(start, fs_info->sectorsize);

	spin_lock(&sinfo->lock);
	btrfs_space_info_update_bytes_may_use(fs_info, sinfo, -range_len);
	trace_btrfs_space_reservation(fs_info, "space_info",
				      sinfo->flags, range_len, 0);
	spin_unlock(&sinfo->lock);
}
/*
 * Give back a data reservation for this inode, normally on an error path.
 *
 * Unlike the _noquota variant, this also releases the qgroup data
 * reservation recorded for the range (per-inode data rsv map of the
 * accurate reserved space framework).
 */
void btrfs_free_reserved_data_space(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	u64 range_end = round_up(start + len, fs_info->sectorsize);
	u64 range_start = round_down(start, fs_info->sectorsize);
	u64 range_len = range_end - range_start;

	btrfs_free_reserved_data_space_noquota(inode, range_start, range_len);
	btrfs_qgroup_free_data(inode, reserved, range_start, range_len);
}
/**
 * btrfs_inode_rsv_release - release any excessive reservation.
 * @inode: the inode we need to release from.
 * @qgroup_free: free or convert qgroup meta.
 *
 * Unlike normal operation, qgroup meta reservation needs to know if we are
 * freeing qgroup reservation or just converting it into per-trans. Normally
 * @qgroup_free is true for error handling, and false for normal release.
 *
 * This is the same as btrfs_block_rsv_release, except that it handles the
 * tracepoint for the reservation.
 */
static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 qgroup_bytes = 0;
	u64 freed;

	/*
	 * block_rsv->size is set statically, so ask to release 0 bytes; what
	 * comes back is whatever the reserve held beyond its size.
	 */
	freed = __btrfs_block_rsv_release(fs_info, &inode->block_rsv, 0,
					  &qgroup_bytes);
	if (freed > 0)
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), freed, 0);

	if (!qgroup_free) {
		btrfs_qgroup_convert_reserved_meta(root, qgroup_bytes);
		return;
	}
	btrfs_qgroup_free_meta_prealloc(root, qgroup_bytes);
}
static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
u64 qgroup_rsv_size = 0;
u64 csum_leaves;
unsigned outstanding_extents;
lockdep_assert_held(&inode->lock);
outstanding_extents = inode->outstanding_extents;
if (outstanding_extents)
reserve_size = btrfs_calc_trans_metadata_size(fs_info,
outstanding_extents + 1);
csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
inode->csum_bytes);
reserve_size += btrfs_calc_trans_metadata_size(fs_info,
csum_leaves);
/*
* For qgroup rsv, the calculation is very simple:
* account one nodesize for each outstanding extent
*
* This is overestimating in most cases.
*/
qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
spin_lock(&block_rsv->lock);
block_rsv->size = reserve_size;
block_rsv->qgroup_rsv_size = qgroup_rsv_size;
spin_unlock(&block_rsv->lock);
}
/*
 * Work out how much metadata (@meta_reserve) and qgroup metadata
 * (@qgroup_reserve) must be reserved up front for @num_bytes of delalloc.
 */
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
				    u64 num_bytes, u64 *meta_reserve,
				    u64 *qgroup_reserve)
{
	u64 max_extents = count_max_extents(num_bytes);
	u64 leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
	/* The extra one is for the inode update at finish ordered time. */
	u64 nr_items = max_extents + leaves + 1;

	*meta_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_items);
	*qgroup_reserve = max_extents * fs_info->nodesize;
}
/*
 * Reserve metadata space, and the matching qgroup metadata space, in the
 * inode's block_rsv for @num_bytes of delalloc.
 *
 * @num_bytes is rounded up to the sectorsize.  On success the inode's
 * outstanding extents and csum_bytes have been increased and the reserved
 * bytes added to inode->block_rsv.
 *
 * Returns 0 on success, otherwise the error returned by
 * btrfs_qgroup_reserve_meta_prealloc() or btrfs_reserve_metadata_bytes().
 */
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 meta_reserve, qgroup_reserve;
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
/*
 * If we are a free space inode we need to not flush since we will be in
 * the middle of a transaction commit. We also don't need the delalloc
 * mutex since we won't race with anybody. We need this mostly to make
 * lockdep shut its filthy mouth.
 *
 * If we have a transaction open (can happen if we call truncate_block
 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
 */
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
delalloc_lock = false;
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
/*
 * NOTE(review): presumably a short back-off while a commit is in
 * progress -- intent not stated in the code; confirm.
 */
if (btrfs_transaction_in_commit(fs_info))
schedule_timeout(1);
}
if (delalloc_lock)
mutex_lock(&inode->delalloc_mutex);
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
/*
 * We always want to do it this way, every other way is wrong and ends
 * in tears. Pre-reserving the amount we are going to add will always
 * be the right way, because otherwise if we have enough parallelism we
 * could end up with thousands of inodes all holding little bits of
 * reservations they were able to make previously and the only way to
 * reclaim that space is to ENOSPC out the operations and clear
 * everything out and try again, which is bad. This way we just
 * over-reserve slightly, and clean up the mess when we are done.
 */
calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
&qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
if (ret)
goto out_fail;
ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
if (ret)
goto out_qgroup;
/*
 * Now we need to update our outstanding extents and csum bytes _first_
 * and then add the reservation to the block_rsv. This keeps us from
 * racing with an ordered completion or some such that would think it
 * needs to free the reservation we just made.
 */
spin_lock(&inode->lock);
nr_extents = count_max_extents(num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
inode->csum_bytes += num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
/* Now we can safely add our space to our block rsv */
btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), meta_reserve, 1);
/* Record the qgroup bytes that were pre-allocated above. */
spin_lock(&block_rsv->lock);
block_rsv->qgroup_rsv_reserved += qgroup_reserve;
spin_unlock(&block_rsv->lock);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return 0;
out_qgroup:
/* The metadata reservation failed: undo the qgroup pre-allocation. */
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
out_fail:
/* Release any excess the inode's reserve holds, freeing the qgroup part. */
btrfs_inode_rsv_release(inode, true);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return ret;
}
/**
 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 * @inode: the inode to release the reservation for.
 * @num_bytes: the number of bytes we are releasing.
 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
 *
 * Drops @num_bytes (rounded up to the sectorsize) from the inode's csum_bytes,
 * recomputes the size of its block reserve and releases whatever the reserve
 * now holds in excess. Called once IO for a given set of bytes completes, or
 * on error for the same reason.
 */
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
				     bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 aligned_bytes = ALIGN(num_bytes, fs_info->sectorsize);

	spin_lock(&inode->lock);
	inode->csum_bytes -= aligned_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	/* Nothing is released when running against a testing fs_info. */
	if (!btrfs_is_testing(fs_info))
		btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
 * btrfs_delalloc_release_extents - release our outstanding_extents
 * @inode: the inode to balance the reservation for.
 * @num_bytes: the number of bytes we originally reserved with
 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
 *
 * Reserving space bumps outstanding_extents for the extents we may add. Once
 * the range has been set delalloc, or the ordered extents exist, the real
 * usage is tracked elsewhere, so this drops the temporary count again. It
 * _must_ be paired with btrfs_delalloc_reserve_metadata.
 */
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
				    bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned nr_extents = count_max_extents(num_bytes);

	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, -nr_extents);
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	/* Nothing is released when running against a testing fs_info. */
	if (!btrfs_is_testing(fs_info))
		btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
 * btrfs_delalloc_reserve_space - reserve data and metadata space for
 *				  delalloc
 * @inode: inode we're writing to
 * @reserved: mandatory parameter, records the qgroup ranges actually
 *	      reserved by this call
 * @start: start of the range we are writing to
 * @len: length of the range we are writing to
 *
 * Two reservations are made, in this order:
 *
 * - data space (plus the corresponding qgroup data space) for the range,
 *   via btrfs_check_data_free_space()
 *
 * - metadata space for @len bytes, via btrfs_delalloc_reserve_metadata()
 *
 * If the metadata step fails, the data reservation is released again.
 *
 * Return 0 for success
 * Return <0 for error(-ENOSPC or -EQUOT)
 */
int btrfs_delalloc_reserve_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len)
{
	int ret = btrfs_check_data_free_space(inode, reserved, start, len);

	if (ret < 0)
		return ret;

	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
	if (ret >= 0)
		return ret;

	/* Metadata reservation failed: roll back the data reservation. */
	btrfs_free_reserved_data_space(inode, *reserved, start, len);
	return ret;
}
/**
 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 * @inode: inode we're releasing space for
 * @reserved: qgroup ranges recorded for the reservation, passed on to
 *	      btrfs_free_reserved_data_space()
 * @start: start position of the space already reserved
 * @len: the len of the space already reserved
 * @qgroup_free: passed on to btrfs_delalloc_release_metadata(): free the
 *		 qgroup meta reservation (true) or convert it (false)
 *
 * This releases the metadata reservation for @len bytes first and then the
 * data reservation for the range, including its qgroup reserved space.
 */
void btrfs_delalloc_release_space(struct inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free)
{
btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}

23
fs/btrfs/delalloc-space.h Normal file
View File

@ -0,0 +1,23 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_DELALLOC_SPACE_H
#define BTRFS_DELALLOC_SPACE_H
struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
#endif /* BTRFS_DELALLOC_SPACE_H */

View File

@ -10,6 +10,7 @@
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@ -24,6 +25,179 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep;
* of hammering updates on the extent allocation tree.
*/
/*
 * Return true when the delayed refs rsv wants at least as much as it and the
 * global rsv currently hold together, i.e. it is time to think about bailing.
 *
 * The global reserve is just kind of magic, so we don't want to rely on it
 * alone to save our bacon.
 */
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *global = &fs_info->global_block_rsv;
	struct btrfs_block_rsv *refs = &fs_info->delayed_refs_rsv;
	bool short_on_space;
	u64 held;

	/* The two locks are taken one after the other, never nested. */
	spin_lock(&global->lock);
	held = global->reserved;
	spin_unlock(&global->lock);

	spin_lock(&refs->lock);
	held += refs->reserved;
	short_on_space = (refs->size >= held);
	spin_unlock(&refs->lock);

	return short_on_space;
}
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
{
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
u64 avg_runtime;
u64 val;
smp_mb();
avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
val = num_entries * avg_runtime;
if (val >= NSEC_PER_SEC)
return 1;
if (val >= NSEC_PER_SEC / 2)
return 2;
return btrfs_check_space_for_delayed_refs(trans->fs_info);
}
/**
 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
 * @fs_info: the fs_info for our fs.
 * @nr: the number of items to drop.
 *
 * This drops the delayed ref head's count from the delayed refs rsv and frees
 * any excess reservation we had.
 */
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
	u64 drop_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
	u64 freed;

	freed = __btrfs_block_rsv_release(fs_info, &fs_info->delayed_refs_rsv,
					  drop_bytes, NULL);
	if (!freed)
		return;

	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
				      0, freed, 0);
}
/**
 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
 * @trans: the trans that may have generated delayed refs
 *
 * Call this anytime trans->delayed_ref_updates may have been adjusted. The
 * additional size is calculated from that count and added to the
 * delayed_refs_rsv, after which the count is reset to zero.
 */
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_rsv *rsv = &fs_info->delayed_refs_rsv;
	u64 extra;

	if (trans->delayed_ref_updates == 0)
		return;

	extra = btrfs_calc_trans_metadata_size(fs_info,
					       trans->delayed_ref_updates);

	/* Growing the size means the reserve is no longer full. */
	spin_lock(&rsv->lock);
	rsv->size += extra;
	rsv->full = 0;
	spin_unlock(&rsv->lock);

	trans->delayed_ref_updates = 0;
}
/**
 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
 * @fs_info: the fs info for our fs.
 * @src: the source block rsv to transfer from.
 * @num_bytes: the number of bytes to transfer.
 *
 * This transfers up to the num_bytes amount from the src rsv to the
 * delayed_refs_rsv. Any extra bytes are returned to the space info.
 */
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *src,
				       u64 num_bytes)
{
	struct btrfs_block_rsv *dst = &fs_info->delayed_refs_rsv;
	u64 moved = 0;
	u64 surplus;

	/* Take the full amount out of the source first. */
	spin_lock(&src->lock);
	src->reserved -= num_bytes;
	src->size -= num_bytes;
	spin_unlock(&src->lock);

	spin_lock(&dst->lock);
	/* Only move what the destination is still missing. */
	if (dst->size > dst->reserved)
		moved = min(num_bytes, dst->size - dst->reserved);
	surplus = num_bytes - moved;

	if (moved)
		dst->reserved += moved;
	if (dst->reserved >= dst->size)
		dst->full = 1;
	spin_unlock(&dst->lock);

	if (moved)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, moved, 1);
	if (surplus)
		btrfs_space_info_add_old_bytes(fs_info, dst->space_info,
					       surplus);
}
/**
 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
 * @fs_info: the fs_info for our fs.
 * @flush: control how we can flush for this reservation.
 *
 * This refills the delayed refs block_rsv by at most one item's worth of
 * space. Returns 0 if nothing was needed or the reservation succeeded, and
 * the error from btrfs_reserve_metadata_bytes() (e.g. -ENOSPC) if we can't
 * make the reservation.
 */
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
				  enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_block_rsv *rsv = &fs_info->delayed_refs_rsv;
	u64 one_item = btrfs_calc_trans_metadata_size(fs_info, 1);
	u64 want = 0;
	int ret;

	/* How much is missing, capped at a single item. */
	spin_lock(&rsv->lock);
	if (rsv->size > rsv->reserved)
		want = min(rsv->size - rsv->reserved, one_item);
	spin_unlock(&rsv->lock);

	if (want == 0)
		return 0;

	ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, rsv,
					   want, flush);
	if (ret)
		return ret;

	btrfs_block_rsv_add_bytes(rsv, want, 0);
	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
				      0, want, 1);
	return 0;
}
/*
* compare two delayed tree backrefs with same bytenr and type
*/
@ -957,13 +1131,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
}
/*
 * This does a simple search for the head node for a given extent. It must be
 * called with the delayed ref spinlock (delayed_refs->lock) held, which is
 * checked with lockdep_assert_held(). Returns the head node if found, or NULL
 * if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
lockdep_assert_held(&delayed_refs->lock);
return find_ref_head(delayed_refs, bytenr, false);
}

View File

@ -364,6 +364,16 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *src,
u64 num_bytes);
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
/*
* helper functions to cast a node into its container
*/

View File

@ -201,7 +201,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return PTR_ERR(bdev);
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
sync_blockdev(bdev);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
@ -237,7 +237,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
rcu_assign_pointer(device->name, name);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
device->io_width = fs_info->sectorsize;
@ -256,6 +255,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_info->fs_devices;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
@ -399,7 +400,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
bool need_unlock;
src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
srcdev_name);
@ -413,11 +413,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return -ETXTBSY;
}
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
if (ret)
return ret;
/*
* Here we commit the transaction to make sure commit_total_bytes
* of all the devices are updated.
@ -431,7 +426,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(trans);
}
need_unlock = true;
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
if (ret)
return ret;
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@ -442,11 +441,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
up_write(&dev_replace->rwsem);
goto leave;
}
dev_replace->cont_reading_from_srcdev_mode = read_src;
WARN_ON(!src_device);
dev_replace->srcdev = src_device;
dev_replace->tgtdev = tgt_device;
@ -471,7 +470,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
need_unlock = false;
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@ -479,16 +477,16 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
/* Commit dev_replace state and reserve 1 item for it. */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
need_unlock = true;
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
up_write(&dev_replace->rwsem);
goto leave;
}
@ -510,8 +508,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
leave:
if (need_unlock)
up_write(&dev_replace->rwsem);
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
@ -678,7 +674,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_device_set_disk_total_bytes(tgt_device,
src_device->disk_total_bytes);
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
tgt_device->commit_total_bytes = src_device->commit_total_bytes;
tgt_device->commit_bytes_used = src_device->bytes_used;
btrfs_assign_next_active_device(src_device, tgt_device);
@ -728,7 +723,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
u64 start = 0;

View File

@ -19,6 +19,7 @@
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@ -40,10 +41,6 @@
#include "tree-checker.h"
#include "ref-verify.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
#endif
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
BTRFS_SUPER_FLAG_ERROR |\
@ -249,16 +246,6 @@ out:
return em;
}
/*
 * Feed @len bytes starting at @data into crc32c(), using @seed as the starting
 * value, and return the resulting checksum.
 */
u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
{
return crc32c(seed, data, len);
}
/*
 * Finish a checksum: store the bitwise complement of @crc into @result via
 * put_unaligned_le32().
 */
void btrfs_csum_final(u32 crc, u8 *result)
{
put_unaligned_le32(~crc, result);
}
/*
* Compute the csum of a btree block and store the result to provided buffer.
*
@ -266,6 +253,8 @@ void btrfs_csum_final(u32 crc, u8 *result)
*/
static int csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
unsigned long len;
unsigned long cur_len;
unsigned long offset = BTRFS_CSUM_SIZE;
@ -273,9 +262,12 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result)
unsigned long map_start;
unsigned long map_len;
int err;
u32 crc = ~(u32)0;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
len = buf->len - offset;
while (len > 0) {
/*
* Note: we don't need to check for the err == 1 case here, as
@ -288,14 +280,13 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result)
if (WARN_ON(err))
return err;
cur_len = min(len, map_len - (offset - map_start));
crc = btrfs_csum_data(kaddr + offset - map_start,
crc, cur_len);
crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
len -= cur_len;
offset += cur_len;
}
memset(result, 0, BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, result);
crypto_shash_final(shash, result);
return 0;
}
@ -356,6 +347,16 @@ out:
return ret;
}
/*
 * Tell whether @csum_type, as read from a superblock, is a checksum type this
 * code accepts.  Only BTRFS_CSUM_TYPE_CRC32 is accepted; every other value
 * yields false.
 */
static bool btrfs_supported_super_csum(u16 csum_type)
{
	bool supported = false;

	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
		supported = true;
		break;
	default:
		break;
	}

	return supported;
}
/*
* Return 0 if the superblock checksum type matches the checksum value of that
* algorithm. Pass the raw disk superblock data.
@ -365,33 +366,25 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
{
struct btrfs_super_block *disk_sb =
(struct btrfs_super_block *)raw_disk_sb;
u16 csum_type = btrfs_super_csum_type(disk_sb);
int ret = 0;
char result[BTRFS_CSUM_SIZE];
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
u32 crc = ~(u32)0;
char result[sizeof(crc)];
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
/*
* The super_block structure does not span the whole
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
* is filled with zeros and is included in the checksum.
*/
crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, result);
/*
* The super_block structure does not span the whole
* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
* filled with zeros and is included in the checksum.
*/
crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
crypto_shash_final(shash, result);
if (memcmp(raw_disk_sb, result, sizeof(result)))
ret = 1;
}
if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
return 1;
if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
btrfs_err(fs_info, "unsupported checksum algorithm %u",
csum_type);
ret = 1;
}
return ret;
return 0;
}
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
@ -873,14 +866,13 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
return btree_csum_one_bio(bio);
}
static int check_async_write(struct btrfs_inode *bi)
static int check_async_write(struct btrfs_fs_info *fs_info,
struct btrfs_inode *bi)
{
if (atomic_read(&bi->sync_writers))
return 0;
#ifdef CONFIG_X86
if (static_cpu_has(X86_FEATURE_XMM4_2))
if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
return 0;
#endif
return 1;
}
@ -889,7 +881,7 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(BTRFS_I(inode));
int async = check_async_write(fs_info, BTRFS_I(inode));
blk_status_t ret;
if (bio_op(bio) != REQ_OP_WRITE) {
@ -2262,6 +2254,29 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
return 0;
}
/*
 * Allocate the shash transform named by btrfs_super_csum_name(@csum_type) and
 * store it in fs_info->csum_shash.
 *
 * Returns 0 on success.  If crypto_alloc_shash() hands back an error pointer,
 * an error is logged, fs_info->csum_shash is left untouched and the encoded
 * error is returned.
 */
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
	const char *name = btrfs_super_csum_name(csum_type);
	struct crypto_shash *tfm = crypto_alloc_shash(name, 0, 0);

	if (!IS_ERR(tfm)) {
		fs_info->csum_shash = tfm;
		return 0;
	}

	btrfs_err(fs_info, "error allocating %s hash for checksum", name);
	return PTR_ERR(tfm);
}
/*
 * Release the checksum transform stored in fs_info->csum_shash (the field
 * btrfs_init_csum_hash() fills in) by passing it to crypto_free_shash().
 */
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
crypto_free_shash(fs_info->csum_shash);
}
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
@ -2577,7 +2592,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
ret = validate_super(fs_info, sb, -1);
if (ret < 0)
goto out;
if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
ret = -EUCLEAN;
btrfs_err(fs_info, "invalid csum type, has %u want %u",
btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
@ -2607,6 +2622,7 @@ int open_ctree(struct super_block *sb,
u32 stripesize;
u64 generation;
u64 features;
u16 csum_type;
struct btrfs_key location;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
@ -2689,7 +2705,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
btrfs_mapping_init(&fs_info->mapping_tree);
extent_map_tree_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@ -2793,6 +2809,8 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->swapfile_pins_lock);
fs_info->swapfile_pins = RB_ROOT;
fs_info->send_in_progress = 0;
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
@ -2812,6 +2830,25 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
/*
* Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
if (!btrfs_supported_super_csum(csum_type)) {
btrfs_err(fs_info, "unsupported checksum algorithm: %u",
csum_type);
err = -EINVAL;
brelse(bh);
goto fail_alloc;
}
ret = btrfs_init_csum_hash(fs_info, csum_type);
if (ret) {
err = ret;
goto fail_alloc;
}
/*
* We want to check superblock checksum, the type is stored inside.
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
@ -2820,7 +2857,7 @@ int open_ctree(struct super_block *sb,
btrfs_err(fs_info, "superblock checksum mismatch");
err = -EINVAL;
brelse(bh);
goto fail_alloc;
goto fail_csum;
}
/*
@ -2857,11 +2894,11 @@ int open_ctree(struct super_block *sb,
if (ret) {
btrfs_err(fs_info, "superblock contains fatal errors");
err = -EINVAL;
goto fail_alloc;
goto fail_csum;
}
if (!btrfs_super_root(disk_super))
goto fail_alloc;
goto fail_csum;
/* check FS state, whether FS is broken. */
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
@ -2883,7 +2920,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
goto fail_alloc;
goto fail_csum;
}
features = btrfs_super_incompat_flags(disk_super) &
@ -2893,7 +2930,7 @@ int open_ctree(struct super_block *sb,
"cannot mount because of unsupported optional features (%llx)",
features);
err = -EINVAL;
goto fail_alloc;
goto fail_csum;
}
features = btrfs_super_incompat_flags(disk_super);
@ -2937,7 +2974,7 @@ int open_ctree(struct super_block *sb,
btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
nodesize, sectorsize);
goto fail_alloc;
goto fail_csum;
}
/*
@ -2953,7 +2990,7 @@ int open_ctree(struct super_block *sb,
"cannot mount read-write because of unsupported optional features (%llx)",
features);
err = -EINVAL;
goto fail_alloc;
goto fail_csum;
}
ret = btrfs_init_workqueues(fs_info, fs_devices);
@ -3331,6 +3368,8 @@ fail_tree_roots:
fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
fail_csum:
btrfs_free_csum_hash(fs_info);
fail_alloc:
fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@ -3472,17 +3511,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
static int write_dev_supers(struct btrfs_device *device,
struct btrfs_super_block *sb, int max_mirrors)
{
struct btrfs_fs_info *fs_info = device->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct buffer_head *bh;
int i;
int ret;
int errors = 0;
u32 crc;
u64 bytenr;
int op_flags;
if (max_mirrors == 0)
max_mirrors = BTRFS_SUPER_MIRROR_MAX;
shash->tfm = fs_info->csum_shash;
for (i = 0; i < max_mirrors; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
@ -3491,10 +3533,10 @@ static int write_dev_supers(struct btrfs_device *device,
btrfs_set_super_bytenr(sb, bytenr);
crc = ~(u32)0;
crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, sb->csum);
crypto_shash_init(shash);
crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
crypto_shash_final(shash, sb->csum);
/* One reference for us, and we leave it for the caller */
bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
@ -3709,7 +3751,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
(flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
min_tolerated = min(min_tolerated,
min_tolerated = min_t(int, min_tolerated,
btrfs_raid_array[BTRFS_RAID_SINGLE].
tolerated_failures);
@ -3718,7 +3760,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
continue;
if (!(flags & btrfs_raid_array[raid_type].bg_flag))
continue;
min_tolerated = min(min_tolerated,
min_tolerated = min_t(int, min_tolerated,
btrfs_raid_array[raid_type].
tolerated_failures);
}

View File

@ -115,8 +115,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
struct btrfs_key *first_key);
u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, u8 *result);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,

File diff suppressed because it is too large Load Diff

View File

@ -359,6 +359,24 @@ do_insert:
return NULL;
}
/**
* __etree_search - search @tree for an entry that contains @offset. Such
* entry would have entry->start <= offset && entry->end >= offset.
*
* @tree - the tree to search
* @offset - offset that should fall within an entry in @tree
* @next_ret - pointer to the first entry whose range ends after @offset
* @prev_ret - pointer to the first entry whose range begins before @offset
* @p_ret - pointer where new node should be anchored (used when inserting an
* entry in the tree)
* @parent_ret - points to entry which would have been the parent of the entry,
* containing @offset
*
* This function returns a pointer to the entry that contains @offset byte
* address. If no such entry exists, then NULL is returned and the other
* pointer arguments to the function are filled, otherwise the found entry is
* returned and other pointers are left untouched.
*/
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
struct rb_node **next_ret,
struct rb_node **prev_ret,
@ -504,9 +522,11 @@ static int insert_state(struct extent_io_tree *tree,
{
struct rb_node *node;
if (end < start)
WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
end, start);
if (end < start) {
btrfs_err(tree->fs_info,
"insert state: end < start %llu %llu", end, start);
WARN_ON(1);
}
state->start = start;
state->end = end;
@ -516,7 +536,8 @@ static int insert_state(struct extent_io_tree *tree,
if (node) {
struct extent_state *found;
found = rb_entry(node, struct extent_state, rb_node);
pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
btrfs_err(tree->fs_info,
"found node %llu %llu on insert of %llu %llu",
found->start, found->end, start, end);
return -EEXIST;
}
@ -1537,8 +1558,8 @@ out:
}
/**
* find_first_clear_extent_bit - finds the first range that has @bits not set
* and that starts after @start
* find_first_clear_extent_bit - find the first range that has @bits not set.
* This range could start before @start.
*
* @tree - the tree to search
* @start - the offset at/after which the found extent should start
@ -1578,12 +1599,52 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
goto out;
}
}
/*
* At this point 'node' either contains 'start' or start is
* before 'node'
*/
state = rb_entry(node, struct extent_state, rb_node);
if (in_range(start, state->start, state->end - state->start + 1) &&
(state->state & bits)) {
start = state->end + 1;
if (in_range(start, state->start, state->end - state->start + 1)) {
if (state->state & bits) {
/*
* |--range with bits sets--|
* |
* start
*/
start = state->end + 1;
} else {
/*
* 'start' falls within a range that doesn't
* have the bits set, so take its start as
* the beginning of the desired range
*
* |--range with bits cleared----|
* |
* start
*/
*start_ret = state->start;
break;
}
} else {
*start_ret = start;
/*
* |---prev range---|---hole/unset---|---node range---|
* |
* start
*
* or
*
* |---hole/unset--||--first node--|
* 0 |
* start
*/
if (prev) {
state = rb_entry(prev, struct extent_state,
rb_node);
*start_ret = state->end + 1;
} else {
*start_ret = 0;
}
break;
}
}
@ -1719,10 +1780,10 @@ static noinline int lock_delalloc_pages(struct inode *inode,
*/
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
@ -2800,12 +2861,11 @@ static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
* never fail. We're returning a bio right now but you can call btrfs_io_bio
* for the appropriate container_of magic
*/
struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
struct bio *btrfs_bio_alloc(u64 first_byte)
{
struct bio *bio;
bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = first_byte >> 9;
btrfs_io_bio_init(btrfs_io_bio(bio));
return bio;
@ -2916,7 +2976,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
}
}
bio = btrfs_bio_alloc(bdev, offset);
bio = btrfs_bio_alloc(offset);
bio_set_dev(bio, bdev);
bio_add_page(bio, page, page_size, pg_offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
@ -3204,21 +3265,10 @@ static inline void contiguous_readpages(struct extent_io_tree *tree,
unsigned long *bio_flags,
u64 *prev_em_start)
{
struct inode *inode;
struct btrfs_ordered_extent *ordered;
struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
int index;
inode = pages[0]->mapping->host;
while (1) {
lock_extent(tree, start, end);
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
end - start + 1);
if (!ordered)
break;
unlock_extent(tree, start, end);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
@ -3234,22 +3284,12 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
unsigned long *bio_flags,
unsigned int read_flags)
{
struct inode *inode = page->mapping->host;
struct btrfs_ordered_extent *ordered;
struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
u64 start = page_offset(page);
u64 end = start + PAGE_SIZE - 1;
int ret;
while (1) {
lock_extent(tree, start, end);
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
PAGE_SIZE);
if (!ordered)
break;
unlock_extent(tree, start, end);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
bio_flags, read_flags, NULL);
@ -3290,7 +3330,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
struct page *page, struct writeback_control *wbc,
u64 delalloc_start, unsigned long *nr_written)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
u64 page_end = delalloc_start + PAGE_SIZE - 1;
bool found;
u64 delalloc_to_write = 0;
@ -3300,8 +3339,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
while (delalloc_end < page_end) {
found = find_lock_delalloc_range(inode, tree,
page,
found = find_lock_delalloc_range(inode, page,
&delalloc_start,
&delalloc_end);
if (!found) {
@ -3310,7 +3348,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
}
ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
delalloc_end, &page_started, nr_written, wbc);
/* File system has been set read-only */
if (ret) {
SetPageError(page);
/*
@ -4542,6 +4579,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
@ -4555,6 +4594,13 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
roots = ulist_alloc(GFP_KERNEL);
tmp_ulist = ulist_alloc(GFP_KERNEL);
if (!roots || !tmp_ulist) {
ret = -ENOMEM;
goto out_free_ulist;
}
start = round_down(start, btrfs_inode_sectorsize(inode));
len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
@ -4565,8 +4611,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
ret = btrfs_lookup_file_extent(NULL, root, path,
btrfs_ino(BTRFS_I(inode)), -1, 0);
if (ret < 0) {
btrfs_free_path(path);
return ret;
goto out_free_ulist;
} else {
WARN_ON(!ret);
if (ret == 1)
@ -4675,7 +4720,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
ret = btrfs_check_shared(root,
btrfs_ino(BTRFS_I(inode)),
bytenr);
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
if (ret)
@ -4718,9 +4763,13 @@ out_free:
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
btrfs_free_path(path);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
btrfs_free_path(path);
ulist_free(roots);
ulist_free(tmp_ulist);
return ret;
}
@ -4808,7 +4857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
eb->bflags = 0;
rwlock_init(&eb->lock);
atomic_set(&eb->blocking_readers, 0);
atomic_set(&eb->blocking_writers, 0);
eb->blocking_writers = 0;
eb->lock_nested = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@ -4827,10 +4876,10 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
#ifdef CONFIG_BTRFS_DEBUG
atomic_set(&eb->spinning_writers, 0);
eb->spinning_writers = 0;
atomic_set(&eb->spinning_readers, 0);
atomic_set(&eb->read_locks, 0);
atomic_set(&eb->write_locks, 0);
eb->write_locks = 0;
#endif
return eb;

View File

@ -167,7 +167,7 @@ struct extent_buffer {
struct rcu_head rcu_head;
pid_t lock_owner;
atomic_t blocking_writers;
int blocking_writers;
atomic_t blocking_readers;
bool lock_nested;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
@ -187,10 +187,10 @@ struct extent_buffer {
wait_queue_head_t read_lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
atomic_t spinning_writers;
int spinning_writers;
atomic_t spinning_readers;
atomic_t read_locks;
atomic_t write_locks;
int write_locks;
struct list_head leak_list;
#endif
};
@ -497,7 +497,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
u64 delalloc_end, struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte);
struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree,
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end);
#endif

View File

@ -8,6 +8,7 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@ -22,9 +23,13 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
sizeof(u32) * (fs_info)->sectorsize)
/*
 * Number of data bytes whose checksums fit into one page-sized
 * struct btrfs_ordered_sum: the space left in a page after the struct header
 * is divided by @csum_size to get a checksum count, which is then multiplied
 * by the filesystem sector size.
 */
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
					u16 csum_size)
{
	u32 nr_csums;

	nr_csums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;

	return nr_csums * fs_info->sectorsize;
}
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@ -144,7 +149,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
}
static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u64 logical_offset, u32 *dst, int dio)
u64 logical_offset, u8 *dst, int dio)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct bio_vec bvec;
@ -182,7 +187,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
}
csum = btrfs_bio->csum;
} else {
csum = (u8 *)dst;
csum = dst;
}
if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
@ -211,7 +216,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio
if (!dio)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
(u32 *)csum, nblocks);
csum, nblocks);
if (count)
goto found;
@ -283,7 +288,8 @@ next:
return 0;
}
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
u8 *dst)
{
return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
}
@ -374,7 +380,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_csum_item);
while (start < csum_end) {
size = min_t(size_t, csum_end - start,
MAX_ORDERED_SUM_BYTES(fs_info));
max_ordered_sum_bytes(fs_info, csum_size));
sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
GFP_NOFS);
if (!sums) {
@ -427,6 +433,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
u64 file_start, int contig)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
char *data;
@ -439,6 +446,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
int i;
u64 offset;
unsigned nofs_flag;
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
nofs_flag = memalloc_nofs_save();
sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
@ -459,6 +467,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
index = 0;
shash->tfm = fs_info->csum_shash;
bio_for_each_segment(bvec, bio, iter) {
if (!contig)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
@ -498,17 +508,14 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
index = 0;
}
sums->sums[index] = ~(u32)0;
crypto_shash_init(shash);
data = kmap_atomic(bvec.bv_page);
sums->sums[index]
= btrfs_csum_data(data + bvec.bv_offset
+ (i * fs_info->sectorsize),
sums->sums[index],
fs_info->sectorsize);
crypto_shash_update(shash, data + bvec.bv_offset
+ (i * fs_info->sectorsize),
fs_info->sectorsize);
kunmap_atomic(data);
btrfs_csum_final(sums->sums[index],
(char *)(sums->sums + index));
index++;
crypto_shash_final(shash, (char *)(sums->sums + index));
index += csum_size;
offset += fs_info->sectorsize;
this_sum_bytes += fs_info->sectorsize;
total_bytes += fs_info->sectorsize;
@ -904,9 +911,9 @@ found:
write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
ins_size);
index += ins_size;
ins_size /= csum_size;
total_bytes += ins_size * fs_info->sectorsize;
index += ins_size;
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {

View File

@ -26,6 +26,7 @@
#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@ -1550,30 +1551,20 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
struct btrfs_ordered_extent *ordered;
u64 lockstart, lockend;
u64 num_bytes;
int ret;
ret = btrfs_start_write_no_snapshotting(root);
if (!ret)
return -ENOSPC;
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
while (1) {
lock_extent(&inode->io_tree, lockstart, lockend);
ordered = btrfs_lookup_ordered_range(inode, lockstart,
lockend - lockstart + 1);
if (!ordered) {
break;
}
unlock_extent(&inode->io_tree, lockstart, lockend);
btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
lockend, NULL);
num_bytes = lockend - lockstart + 1;
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
@ -2721,6 +2712,11 @@ out_only_mutex:
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
struct timespec64 now = current_time(inode);
inode_inc_iversion(inode);
inode->i_mtime = now;
inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@ -2801,9 +2797,9 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
}
enum {
RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
RANGE_BOUNDARY_HOLE = 2,
RANGE_BOUNDARY_WRITTEN_EXTENT,
RANGE_BOUNDARY_PREALLOC_EXTENT,
RANGE_BOUNDARY_HOLE,
};
static int btrfs_zero_range_check_range_boundary(struct inode *inode,

View File

@ -18,6 +18,8 @@
#include "extent_io.h"
#include "inode-map.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_32K
@ -465,9 +467,8 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
if (index == 0)
offset = sizeof(u32) * io_ctl->num_pages;
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
PAGE_SIZE - offset);
btrfs_csum_final(crc, (u8 *)&crc);
crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
tmp += index;
@ -493,9 +494,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
val = *tmp;
io_ctl_map_page(io_ctl, 0);
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
PAGE_SIZE - offset);
btrfs_csum_final(crc, (u8 *)&crc);
crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
btrfs_crc32c_final(crc, (u8 *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->fs_info,
"csum mismatch on free space cache");
@ -3166,8 +3166,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
space_info->bytes_readonly += reserved_bytes;
block_group->reserved -= reserved_bytes;
space_info->bytes_reserved -= reserved_bytes;
spin_unlock(&space_info->lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
}
return ret;
@ -3358,7 +3358,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
if (cleanup) {
mutex_lock(&fs_info->chunk_mutex);
em_tree = &fs_info->mapping_tree.map_tree;
em_tree = &fs_info->mapping_tree;
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, block_group->key.objectid,
1);

View File

@ -11,6 +11,7 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "transaction.h"
#include "delalloc-space.h"
static int caching_kthread(void *data)
{

View File

@ -47,6 +47,7 @@
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"
#include "delalloc-space.h"
struct btrfs_iget_args {
struct btrfs_key *location;
@ -1932,17 +1933,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
u64 length = 0;
u64 map_length;
int ret;
struct btrfs_io_geometry geom;
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
length = bio->bi_iter.bi_size;
map_length = length;
ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
NULL, 0);
ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
&geom);
if (ret < 0)
return ret;
if (map_length < length + size)
if (geom.len < length + size)
return 1;
return 0;
}
@ -3203,16 +3206,23 @@ static int __readpage_endio_check(struct inode *inode,
int icsum, struct page *page,
int pgoff, u64 start, size_t len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
u32 csum_expected;
u32 csum = ~(u32)0;
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
csum_expected = *(((u32 *)io_bio->csum) + icsum);
csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;
kaddr = kmap_atomic(page);
csum = btrfs_csum_data(kaddr + pgoff, csum, len);
btrfs_csum_final(csum, (u8 *)&csum);
if (csum != csum_expected)
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
crypto_shash_update(shash, kaddr + pgoff, len);
crypto_shash_final(shash, csum);
if (memcmp(csum, csum_expected, csum_size))
goto zeroit;
kunmap_atomic(kaddr);
@ -3286,6 +3296,28 @@ void btrfs_add_delayed_iput(struct inode *inode)
wake_up_process(fs_info->cleaner_kthread);
}
/*
 * Run one pending delayed iput for @inode.
 *
 * Entered and exited with fs_info->delayed_iput_lock held: the lock is
 * dropped around the iput() and re-taken before returning, so callers can
 * keep iterating or unlock as usual.
 */
static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
struct btrfs_inode *inode)
{
/* Take the inode off the delayed list while the lock is still held. */
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
/* NOTE(review): iput() is done unlocked, presumably because it may sleep - confirm. */
iput(&inode->vfs_inode);
/* Wake waiters once the count of outstanding delayed iputs reaches zero. */
if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
wake_up(&fs_info->delayed_iputs_wait);
/* Restore the locking state the caller expects. */
spin_lock(&fs_info->delayed_iput_lock);
}
/*
 * Run the delayed iput queued for @inode, if there is one.
 *
 * The list is first checked without the lock as a cheap fast path and then
 * re-checked under fs_info->delayed_iput_lock before acting on it.
 */
static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
				   struct btrfs_inode *inode)
{
	/* Nothing queued for this inode: skip taking the lock at all. */
	if (list_empty(&inode->delayed_iput))
		return;

	spin_lock(&fs_info->delayed_iput_lock);
	/* Re-check now that we hold the lock; the entry may be gone already. */
	if (!list_empty(&inode->delayed_iput))
		run_delayed_iput_locked(fs_info, inode);
	spin_unlock(&fs_info->delayed_iput_lock);
}
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
@ -3295,12 +3327,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
wake_up(&fs_info->delayed_iputs_wait);
spin_lock(&fs_info->delayed_iput_lock);
run_delayed_iput_locked(fs_info, inode);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
@ -3935,9 +3962,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
int ret = 0;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
u64 index;
u64 ino = btrfs_ino(inode);
u64 dir_ino = btrfs_ino(dir);
@ -3955,8 +3980,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
}
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &key);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto err;
@ -4009,6 +4032,17 @@ skip_backref:
ret = 0;
else if (ret)
btrfs_abort_transaction(trans, ret);
/*
* If we have a pending delayed iput we could end up with the final iput
* being run in btrfs-cleaner context. If we have enough of these built
* up we can end up burning a lot of time in btrfs-cleaner without any
* way to throttle the unlinks. Since we're currently holding a ref on
* the inode we can run the delayed iput here without any issues as the
* final iput won't be done until after we drop the ref we're currently
* holding.
*/
btrfs_run_delayed_iput(fs_info, inode);
err:
btrfs_free_path(path);
if (ret)
@ -5008,21 +5042,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
if (size <= hole_start)
return 0;
while (1) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(io_tree, hole_start, block_end - 1,
&cached_state);
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
block_end - hole_start);
if (!ordered)
break;
unlock_extent_cached(io_tree, hole_start, block_end - 1,
&cached_state);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start,
block_end - 1, &cached_state);
cur_offset = hole_start;
while (1) {
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
@ -8318,22 +8339,21 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
struct bio *orig_bio = dip->orig_bio;
u64 start_sector = orig_bio->bi_iter.bi_sector;
u64 file_offset = dip->logical_offset;
u64 map_length;
int async_submit = 0;
u64 submit_len;
int clone_offset = 0;
int clone_len;
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
map_length = orig_bio->bi_iter.bi_size;
submit_len = map_length;
ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
&map_length, NULL, 0);
submit_len = orig_bio->bi_iter.bi_size;
ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
start_sector << 9, submit_len, &geom);
if (ret)
return -EIO;
if (map_length >= submit_len) {
if (geom.len >= submit_len) {
bio = orig_bio;
dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
goto submit;
@ -8346,10 +8366,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
async_submit = 1;
/* bio split */
ASSERT(map_length <= INT_MAX);
ASSERT(geom.len <= INT_MAX);
atomic_inc(&dip->pending_bios);
do {
clone_len = min_t(int, submit_len, map_length);
clone_len = min_t(int, submit_len, geom.len);
/*
* This will never fail as it's passing GFP_NOFS and
@ -8386,9 +8406,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
start_sector += clone_len >> 9;
file_offset += clone_len;
map_length = submit_len;
ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
start_sector << 9, &map_length, NULL, 0);
ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
start_sector << 9, submit_len, &geom);
if (ret)
goto out_err;
} while (submit_len > 0);

View File

@ -43,6 +43,8 @@
#include "qgroup.h"
#include "tree-log.h"
#include "compression.h"
#include "space-info.h"
#include "delalloc-space.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@ -3993,6 +3995,27 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (!same_inode)
inode_dio_wait(inode_out);
/*
* Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
*
* Btrfs' back references do not have a block level granularity, they
* work at the whole extent level.
* A NOCOW buffered write without data space reserved may not be able
* to fall back to CoW due to lack of data space, and thus could cause
* data loss.
*
* Here we take a shortcut by flushing the whole inode, so that all
* nocow writes should reach disk as nocow before we increase the
* reference count of the extent. We could do better by only flushing
* NOCOW data, but that needs extra accounting.
*
* Also we don't need to check ASYNC_EXTENT, as async extents will be
* CoWed anyway, not affecting the nocow part.
*/
ret = filemap_flush(inode_in->i_mapping);
if (ret < 0)
return ret;
ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
wb_len);
if (ret < 0)

View File

@ -15,19 +15,19 @@
#ifdef CONFIG_BTRFS_DEBUG
static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
{
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
WARN_ON(eb->spinning_writers);
eb->spinning_writers++;
}
static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
{
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
atomic_dec(&eb->spinning_writers);
WARN_ON(eb->spinning_writers != 1);
eb->spinning_writers--;
}
static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
{
WARN_ON(atomic_read(&eb->spinning_writers));
WARN_ON(eb->spinning_writers);
}
static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
@ -58,17 +58,17 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
{
atomic_inc(&eb->write_locks);
eb->write_locks++;
}
static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
{
atomic_dec(&eb->write_locks);
eb->write_locks--;
}
void btrfs_assert_tree_locked(struct extent_buffer *eb)
{
BUG_ON(!atomic_read(&eb->write_locks));
BUG_ON(!eb->write_locks);
}
#else
@ -111,10 +111,10 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
if (atomic_read(&eb->blocking_writers) == 0) {
if (eb->blocking_writers == 0) {
btrfs_assert_spinning_writers_put(eb);
btrfs_assert_tree_locked(eb);
atomic_inc(&eb->blocking_writers);
eb->blocking_writers++;
write_unlock(&eb->lock);
}
}
@ -148,12 +148,11 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb)
*/
if (eb->lock_nested && current->pid == eb->lock_owner)
return;
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
BUG_ON(eb->blocking_writers != 1);
btrfs_assert_spinning_writers_get(eb);
/* atomic_dec_and_test implies a barrier */
if (atomic_dec_and_test(&eb->blocking_writers))
cond_wake_up_nomb(&eb->write_lock_wq);
if (--eb->blocking_writers == 0)
cond_wake_up(&eb->write_lock_wq);
}
/*
@ -167,12 +166,10 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
if (trace_btrfs_tree_read_lock_enabled())
start_ns = ktime_get_ns();
again:
BUG_ON(!atomic_read(&eb->blocking_writers) &&
current->pid == eb->lock_owner);
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers) &&
current->pid == eb->lock_owner) {
BUG_ON(eb->blocking_writers == 0 &&
current->pid == eb->lock_owner);
if (eb->blocking_writers && current->pid == eb->lock_owner) {
/*
* This extent is already write-locked by our thread. We allow
* an additional read lock to be added because it's for the same
@ -185,10 +182,10 @@ again:
trace_btrfs_tree_read_lock(eb, start_ns);
return;
}
if (atomic_read(&eb->blocking_writers)) {
if (eb->blocking_writers) {
read_unlock(&eb->lock);
wait_event(eb->write_lock_wq,
atomic_read(&eb->blocking_writers) == 0);
eb->blocking_writers == 0);
goto again;
}
btrfs_assert_tree_read_locks_get(eb);
@ -203,11 +200,11 @@ again:
*/
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
{
if (atomic_read(&eb->blocking_writers))
if (eb->blocking_writers)
return 0;
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers)) {
if (eb->blocking_writers) {
read_unlock(&eb->lock);
return 0;
}
@ -223,13 +220,13 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
*/
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (atomic_read(&eb->blocking_writers))
if (eb->blocking_writers)
return 0;
if (!read_trylock(&eb->lock))
return 0;
if (atomic_read(&eb->blocking_writers)) {
if (eb->blocking_writers) {
read_unlock(&eb->lock);
return 0;
}
@ -245,13 +242,11 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
*/
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers))
if (eb->blocking_writers || atomic_read(&eb->blocking_readers))
return 0;
write_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers)) {
if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
return 0;
}
@ -322,10 +317,9 @@ void btrfs_tree_lock(struct extent_buffer *eb)
WARN_ON(eb->lock_owner == current->pid);
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
wait_event(eb->write_lock_wq, eb->blocking_writers == 0);
write_lock(&eb->lock);
if (atomic_read(&eb->blocking_readers) ||
atomic_read(&eb->blocking_writers)) {
if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) {
write_unlock(&eb->lock);
goto again;
}
@ -340,7 +334,7 @@ again:
*/
void btrfs_tree_unlock(struct extent_buffer *eb)
{
int blockers = atomic_read(&eb->blocking_writers);
int blockers = eb->blocking_writers;
BUG_ON(blockers > 1);
@ -351,7 +345,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
btrfs_assert_no_spinning_writers(eb);
atomic_dec(&eb->blocking_writers);
eb->blocking_writers--;
/* Use the lighter barrier after atomic */
smp_mb__after_atomic();
cond_wake_up_nomb(&eb->write_lock_wq);

View File

@ -13,6 +13,7 @@
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@ -924,14 +925,16 @@ out:
* be reclaimed before their checksum is actually put into the btree
*/
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len)
u8 *sum, int len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_ordered_extent *ordered;
struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = btrfs_inode_sectorsize(inode);
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
ordered = btrfs_lookup_ordered_extent(inode, offset);
@ -947,10 +950,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
num_sectors = ordered_sum->len >>
inode->i_sb->s_blocksize_bits;
num_sectors = min_t(int, len - index, num_sectors - i);
memcpy(sum + index, ordered_sum->sums + i,
num_sectors);
memcpy(sum + index, ordered_sum->sums + i * csum_size,
num_sectors * csum_size);
index += (int)num_sectors;
index += (int)num_sectors * csum_size;
if (index == len)
goto out;
disk_bytenr += num_sectors * sectorsize;
@ -962,6 +965,51 @@ out:
return index;
}
/*
 * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all
 * pending ordered extents in it are run to completion.
 *
 * @tree:         IO tree used for locking out other users of the range
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * This function always returns with the given range locked, ensuring after it's
 * called no ordered extent can be pending.
 */
void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
					struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;

	/*
	 * Operate directly on the caller's slot when one is supplied, so the
	 * cached state really is handed back. Copying *cached_state into a
	 * local would leave the caller's pointer untouched and strand the
	 * reference taken while locking.
	 */
	if (cached_state)
		cachedp = cached_state;

	while (1) {
		lock_extent_bits(tree, start, end, cachedp);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for the local cache
			 * since we aren't exposing it outside of this function
			 */
			if (!cached_state)
				refcount_dec(&cache->refs);
			break;
		}
		/* Drop the range lock while waiting for the ordered extent. */
		unlock_extent_cached(tree, start, end, cachedp);
		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}
}
int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",

View File

@ -23,7 +23,7 @@ struct btrfs_ordered_sum {
int len;
struct list_head list;
/* last field is a variable length array of csums */
u32 sums[];
u8 sums[];
};
/*
@ -183,11 +183,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len);
u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
int __init ordered_data_init(void);
void __cold ordered_data_exit(void);

View File

@ -153,11 +153,11 @@ static void print_eb_refs_lock(struct extent_buffer *eb)
#ifdef CONFIG_BTRFS_DEBUG
btrfs_info(eb->fs_info,
"refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u",
atomic_read(&eb->refs), atomic_read(&eb->write_locks),
atomic_read(&eb->refs), eb->write_locks,
atomic_read(&eb->read_locks),
atomic_read(&eb->blocking_writers),
eb->blocking_writers,
atomic_read(&eb->blocking_readers),
atomic_read(&eb->spinning_writers),
eb->spinning_writers,
atomic_read(&eb->spinning_readers),
eb->lock_owner, current->pid);
#endif

View File

@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len)
if (!value)
return 0;
if (!strncmp("lzo", value, 3))
return 0;
else if (!strncmp("zlib", value, 4))
return 0;
else if (!strncmp("zstd", value, 4))
if (btrfs_compress_is_valid_type(value, len))
return 0;
return -EINVAL;
@ -341,7 +337,7 @@ static int inherit_props(struct btrfs_trans_handle *trans,
for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
const struct prop_handler *h = &prop_handlers[i];
const char *value;
u64 num_bytes;
u64 num_bytes = 0;
if (!h->inheritable)
continue;

View File

@ -2614,6 +2614,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
int ret = 0;
int i;
u64 *i_qgroups;
bool committing = false;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
@ -2621,7 +2622,25 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u32 level_size = 0;
u64 nums;
mutex_lock(&fs_info->qgroup_ioctl_lock);
/*
* There are only two callers of this function.
*
* One is create_subvol() in the ioctl context, which needs to hold
* the qgroup_ioctl_lock.
*
* The other is create_pending_snapshot(), where no other qgroup
* code can modify the fs, as they all need to either start a new trans
* or hold a trans handle; thus we don't need to hold
* qgroup_ioctl_lock.
* This avoids a long and complex lock chain and makes lockdep happy.
*/
spin_lock(&fs_info->trans_lock);
if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
committing = true;
spin_unlock(&fs_info->trans_lock);
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
goto out;
@ -2785,7 +2804,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
unlock:
spin_unlock(&fs_info->qgroup_lock);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}

View File

@ -7,7 +7,7 @@
#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H
static inline int nr_parity_stripes(struct map_lookup *map)
static inline int nr_parity_stripes(const struct map_lookup *map)
{
if (map->type & BTRFS_BLOCK_GROUP_RAID5)
return 1;
@ -17,7 +17,7 @@ static inline int nr_parity_stripes(struct map_lookup *map)
return 0;
}
static inline int nr_data_stripes(struct map_lookup *map)
static inline int nr_data_stripes(const struct map_lookup *map)
{
return map->num_stripes - nr_parity_stripes(map);
}

View File

@ -20,6 +20,7 @@
#include "inode-map.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
/*
* backref_node, mapping_node and tree_block start with this

View File

@ -9,6 +9,8 @@
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "qgroup.h"
#include "space-info.h"
/*
* Read a root item from the tree. In case we detect a root item smaller than
@ -497,3 +499,57 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
spin_unlock(&root->root_item_lock);
}
/*
 * btrfs_subvolume_reserve_metadata() - reserve space for a subvolume operation
 * @root:           the root of the parent directory
 * @rsv:            block reservation to fill
 * @items:          the number of items we need to reserve space for
 * @use_global_rsv: allow fallback to the global block reservation
 *
 * This function is used to reserve the space for snapshot/subvolume
 * creation and deletion. Those operations differ from the common
 * file/directory operations: they change two fs/file trees and the root
 * tree, and the number of items that the qgroup reserves differs from the
 * free space reservation. So we cannot use the space reservation mechanism
 * in start_transaction().
 *
 * Returns the result of the last reservation step attempted; a non-zero
 * value is treated as failure and any qgroup reservation made here is
 * released again before returning.
 */
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
				     struct btrfs_block_rsv *rsv, int items,
				     bool use_global_rsv)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 qgroup_bytes = 0;
	u64 meta_bytes;
	int err;

	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
		/* One for parent inode, two for dir entries */
		qgroup_bytes = 3 * fs_info->nodesize;
		err = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_bytes,
							 true);
		if (err)
			return err;
	}

	meta_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
	rsv->space_info = btrfs_find_space_info(fs_info,
						BTRFS_BLOCK_GROUP_METADATA);

	err = btrfs_block_rsv_add(root, rsv, meta_bytes,
				  BTRFS_RESERVE_FLUSH_ALL);
	/* Only an ENOSPC failure may be retried against the global reserve. */
	if (err == -ENOSPC && use_global_rsv)
		err = btrfs_block_rsv_migrate(&fs_info->global_block_rsv, rsv,
					      meta_bytes, true);

	/* Undo the qgroup reservation if the block reservation failed. */
	if (err && qgroup_bytes)
		btrfs_qgroup_free_meta_prealloc(root, qgroup_bytes);

	return err;
}
/*
 * Release the reservation held in @rsv via btrfs_block_rsv_release().
 */
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv)
{
/* NOTE(review): (u64)-1 presumably means "release everything" - confirm against btrfs_block_rsv_release(). */
btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
}

View File

@ -6,6 +6,7 @@
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
@ -1787,11 +1788,12 @@ static int scrub_checksum(struct scrub_block *sblock)
static int scrub_checksum_data(struct scrub_block *sblock)
{
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 csum[BTRFS_CSUM_SIZE];
u8 *on_disk_csum;
struct page *page;
void *buffer;
u32 crc = ~(u32)0;
u64 len;
int index;
@ -1799,6 +1801,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
if (!sblock->pagev[0]->have_csum)
return 0;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
on_disk_csum = sblock->pagev[0]->csum;
page = sblock->pagev[0]->page;
buffer = kmap_atomic(page);
@ -1808,7 +1813,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, PAGE_SIZE);
crc = btrfs_csum_data(buffer, crc, l);
crypto_shash_update(shash, buffer, l);
kunmap_atomic(buffer);
len -= l;
if (len == 0)
@ -1820,7 +1825,7 @@ static int scrub_checksum_data(struct scrub_block *sblock)
buffer = kmap_atomic(page);
}
btrfs_csum_final(crc, csum);
crypto_shash_final(shash, csum);
if (memcmp(csum, on_disk_csum, sctx->csum_size))
sblock->checksum_error = 1;
@ -1832,16 +1837,19 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_header *h;
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
void *mapped_buffer;
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
u64 len;
int index;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
BUG_ON(sblock->page_count < 1);
page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
@ -1875,7 +1883,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, mapped_size);
crc = btrfs_csum_data(p, crc, l);
crypto_shash_update(shash, p, l);
kunmap_atomic(mapped_buffer);
len -= l;
if (len == 0)
@ -1889,7 +1897,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
p = mapped_buffer;
}
btrfs_csum_final(crc, calculated_csum);
crypto_shash_final(shash, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
sblock->checksum_error = 1;
@ -1900,18 +1908,22 @@ static int scrub_checksum_super(struct scrub_block *sblock)
{
struct btrfs_super_block *s;
struct scrub_ctx *sctx = sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 calculated_csum[BTRFS_CSUM_SIZE];
u8 on_disk_csum[BTRFS_CSUM_SIZE];
struct page *page;
void *mapped_buffer;
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
int fail_gen = 0;
int fail_cor = 0;
u64 len;
int index;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
BUG_ON(sblock->page_count < 1);
page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
@ -1934,7 +1946,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
for (;;) {
u64 l = min_t(u64, len, mapped_size);
crc = btrfs_csum_data(p, crc, l);
crypto_shash_update(shash, p, l);
kunmap_atomic(mapped_buffer);
len -= l;
if (len == 0)
@ -1948,7 +1960,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
p = mapped_buffer;
}
btrfs_csum_final(crc, calculated_csum);
crypto_shash_final(shash, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
++fail_cor;
@ -2448,7 +2460,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
ASSERT(index < UINT_MAX);
num_sectors = sum->len / sctx->fs_info->sectorsize;
memcpy(csum, sum->sums + index, sctx->csum_size);
memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
if (index == num_sectors - 1) {
list_del(&sum->list);
kfree(sum);
@ -2660,18 +2672,18 @@ static int get_raid56_logic_offset(u64 physical, int num,
u64 last_offset;
u32 stripe_index;
u32 rot;
const int data_stripes = nr_data_stripes(map);
last_offset = (physical - map->stripes[num].physical) *
nr_data_stripes(map);
last_offset = (physical - map->stripes[num].physical) * data_stripes;
if (stripe_start)
*stripe_start = last_offset;
*offset = last_offset;
for (i = 0; i < nr_data_stripes(map); i++) {
for (i = 0; i < data_stripes; i++) {
*offset = last_offset + i * map->stripe_len;
stripe_nr = div64_u64(*offset, map->stripe_len);
stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
stripe_nr = div_u64(stripe_nr, data_stripes);
/* Work out the disk rotation on this stripe-set */
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
@ -3079,7 +3091,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
offset = map->stripe_len * (num / map->sub_stripes);
increment = map->stripe_len * factor;
mirror_num = num % map->sub_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
@ -3410,15 +3422,15 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_block_group_cache *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
int i;
int ret = 0;
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
read_unlock(&map_tree->map_tree.lock);
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, chunk_offset, 1);
read_unlock(&map_tree->lock);
if (!em) {
/*

View File

@ -686,7 +686,7 @@ static int send_cmd(struct send_ctx *sctx)
hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
hdr->crc = 0;
crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@ -6929,9 +6929,23 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
if (ret)
goto out;
mutex_lock(&fs_info->balance_mutex);
if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
mutex_unlock(&fs_info->balance_mutex);
btrfs_warn_rl(fs_info,
"cannot run send because a balance operation is in progress");
ret = -EAGAIN;
goto out;
}
fs_info->send_in_progress++;
mutex_unlock(&fs_info->balance_mutex);
current->journal_info = BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
current->journal_info = NULL;
mutex_lock(&fs_info->balance_mutex);
fs_info->send_in_progress--;
mutex_unlock(&fs_info->balance_mutex);
if (ret < 0)
goto out;

1094
fs/btrfs/space-info.c Normal file

File diff suppressed because it is too large Load Diff

133
fs/btrfs/space-info.h Normal file
View File

@ -0,0 +1,133 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
/*
 * Space accounting state: byte counters, allocation status bits, pending
 * reservation tickets and per-RAID-type block group lists.
 *
 * NOTE(review): presumably one instance exists per block group type (data,
 * metadata, system, or mixed data+metadata) - confirm in space-info.c.
 */
struct btrfs_space_info {
/*
 * Held while bytes_may_use/bytes_pinned are changed through the
 * btrfs_space_info_update_*() helpers (they assert it via lockdep), and
 * protects ro_bgs.
 */
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
this doesn't take mirrors into account */
u64 bytes_used; /* total bytes used,
this doesn't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
u64 max_extent_size; /* This will hold the maximum extent size of
the space info if we had an ENOSPC in the
allocator. */
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
unsigned int flush:1; /* set if we are trying to make space */
unsigned int force_alloc; /* set if we need to force a chunk
alloc for this space */
u64 disk_used; /* total bytes used on disk */
u64 disk_total; /* total bytes on disk, takes mirrors into
account */
/*
 * Block group type bits of this space, e.g. BTRFS_BLOCK_GROUP_DATA and
 * BTRFS_BLOCK_GROUP_METADATA (both set means a mixed space, see
 * btrfs_mixed_space_info()).
 */
u64 flags;
/*
* bytes_pinned is kept in line with what is actually pinned, as in
* we've called update_block_group and dropped the bytes_used counter
* and increased the bytes_pinned counter. However this means that
* bytes_pinned does not reflect the bytes that will be pinned once the
* delayed refs are flushed, so this counter is inc'ed every time we
* call btrfs_free_extent so it is a realtime count of what will be
* freed once the transaction is committed. It will be zeroed every
* time the transaction commits.
*/
struct percpu_counter total_bytes_pinned;
/* NOTE(review): presumably links this space info into a per-fs list - confirm */
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
/*
 * NOTE(review): presumably queues of struct reserve_ticket, linked through
 * reserve_ticket::list - confirm against space-info.c.
 */
struct list_head priority_tickets;
struct list_head tickets;
/*
* tickets_id just indicates the next ticket will be handled, so note
* it's not stored per ticket.
*/
u64 tickets_id;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
wait_queue_head_t wait;
/* Embedded kobject plus one kobject pointer per RAID type */
struct kobject kobj;
struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
/*
 * A single pending space reservation request.
 *
 * NOTE(review): the users of this structure are not visible in this header.
 * Presumably @orig_bytes is the size originally requested, @bytes the amount
 * still outstanding, @error the result handed back to the waiter, @list the
 * link into btrfs_space_info::tickets or ::priority_tickets and @wait the
 * queue the requester sleeps on - confirm against space-info.c.
 */
struct reserve_ticket {
u64 orig_bytes;
u64 bytes;
int error;
struct list_head list;
wait_queue_head_t wait;
};
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
}
/*
 * Declare a helper function to detect underflow of various space info members
 *
 * DECLARE_SPACE_INFO_UPDATE(name) generates
 *
 *   btrfs_space_info_update_<name>(fs_info, sinfo, bytes)
 *
 * which adjusts the counter sinfo-><name> by the signed delta @bytes:
 *
 * - sinfo->lock must be held by the caller (checked with lockdep).
 * - The trace event trace_update_<name>() is emitted with the counter value
 *   before the change and the delta.
 * - When @bytes is negative and larger in magnitude than the current counter
 *   value, the update would underflow: a warning is raised and the counter is
 *   reset to 0 instead of wrapping around.
 * - Otherwise @bytes is simply added to the counter.
 */
#define DECLARE_SPACE_INFO_UPDATE(name) \
static inline void \
btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
struct btrfs_space_info *sinfo, \
s64 bytes) \
{ \
lockdep_assert_held(&sinfo->lock); \
trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
if (bytes < 0 && sinfo->name < -bytes) { \
WARN_ON(1); \
sinfo->name = 0; \
return; \
} \
sinfo->name += bytes; \
}
/* Generates btrfs_space_info_update_bytes_may_use() */
DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
/* Generates btrfs_space_info_update_bytes_pinned() */
DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes);
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
u64 bytes_readonly,
struct btrfs_space_info **space_info);
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
u64 flags);
u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
bool may_use_included);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 orig_bytes,
enum btrfs_reserve_flush_enum flush);
#endif /* BTRFS_SPACE_INFO_H */

View File

@ -42,6 +42,7 @@
#include "dev-replace.h"
#include "free-space-cache.h"
#include "backref.h"
#include "space-info.h"
#include "tests/btrfs-tests.h"
#include "qgroup.h"
@ -1553,6 +1554,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
if (!strstr(crc32c_impl(), "generic"))
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
error = btrfs_fill_super(s, fs_devices, data);
}
if (!error)
@ -1601,14 +1604,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
{
struct vfsmount *mnt_root;
struct dentry *root;
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
int error = 0;
if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
error = btrfs_parse_subvol_options(data, &subvol_name,
&subvol_objectid);
if (error) {
@ -1904,8 +1903,9 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
u64 type;
u64 avail_space;
u64 min_stripe_size;
int min_stripes = 1, num_stripes = 1;
int min_stripes, num_stripes = 1;
int i = 0, nr_devices;
const struct btrfs_raid_attr *rattr;
/*
* We aren't under the device list lock, so this is racy-ish, but good
@ -1929,21 +1929,18 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
/* calc min stripe number for data space allocation */
type = btrfs_data_alloc_profile(fs_info);
if (type & BTRFS_BLOCK_GROUP_RAID0) {
min_stripes = 2;
num_stripes = nr_devices;
} else if (type & BTRFS_BLOCK_GROUP_RAID1) {
min_stripes = 2;
num_stripes = 2;
} else if (type & BTRFS_BLOCK_GROUP_RAID10) {
min_stripes = 4;
num_stripes = 4;
}
rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];
min_stripes = rattr->devs_min;
if (type & BTRFS_BLOCK_GROUP_DUP)
min_stripe_size = 2 * BTRFS_STRIPE_LEN;
else
min_stripe_size = BTRFS_STRIPE_LEN;
if (type & BTRFS_BLOCK_GROUP_RAID0)
num_stripes = nr_devices;
else if (type & BTRFS_BLOCK_GROUP_RAID1)
num_stripes = 2;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = 4;
/* Adjust for more than 1 stripe per device */
min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
@ -2466,3 +2463,4 @@ late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
MODULE_LICENSE("GPL");
MODULE_SOFTDEP("pre: crc32c");

View File

@ -16,6 +16,7 @@
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
#include "space-info.h"
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);

View File

@ -10,6 +10,7 @@
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../extent_io.h"
#include "../btrfs_inode.h"
#define PROCESS_UNLOCK (1 << 0)
#define PROCESS_RELEASE (1 << 1)
@ -58,7 +59,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
static int test_find_delalloc(u32 sectorsize)
{
struct inode *inode;
struct extent_io_tree tmp;
struct extent_io_tree *tmp;
struct page *page;
struct page *locked_page = NULL;
unsigned long index = 0;
@ -76,12 +77,13 @@ static int test_find_delalloc(u32 sectorsize)
test_std_err(TEST_ALLOC_INODE);
return -ENOMEM;
}
tmp = &BTRFS_I(inode)->io_tree;
/*
* Passing NULL as we don't have fs_info but tracepoints are not used
* at this point
*/
extent_io_tree_init(NULL, &tmp, IO_TREE_SELFTEST, NULL);
extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL);
/*
* First go through and create and mark all of our pages dirty, we pin
@ -108,10 +110,10 @@ static int test_find_delalloc(u32 sectorsize)
* |--- delalloc ---|
* |--- search ---|
*/
set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
test_err("should have found at least one delalloc");
@ -122,7 +124,7 @@ static int test_find_delalloc(u32 sectorsize)
sectorsize - 1, start, end);
goto out_bits;
}
unlock_extent(&tmp, start, end);
unlock_extent(tmp, start, end);
unlock_page(locked_page);
put_page(locked_page);
@ -139,10 +141,10 @@ static int test_find_delalloc(u32 sectorsize)
test_err("couldn't find the locked page");
goto out_bits;
}
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
test_err("couldn't find delalloc in our range");
@ -158,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
unlock_extent(&tmp, start, end);
unlock_extent(tmp, start, end);
/* locked_page was unlocked above */
put_page(locked_page);
@ -176,7 +178,7 @@ static int test_find_delalloc(u32 sectorsize)
}
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (found) {
test_err("found range when we shouldn't have");
@ -194,10 +196,10 @@ static int test_find_delalloc(u32 sectorsize)
*
* We are re-using our test_start from above since it works out well.
*/
set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
test_err("didn't find our range");
@ -213,7 +215,7 @@ static int test_find_delalloc(u32 sectorsize)
test_err("pages in range were not all locked");
goto out_bits;
}
unlock_extent(&tmp, start, end);
unlock_extent(tmp, start, end);
/*
* Now to test where we run into a page that is no longer dirty in the
@ -238,7 +240,7 @@ static int test_find_delalloc(u32 sectorsize)
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
found = find_lock_delalloc_range(inode, locked_page, &start,
&end);
if (!found) {
test_err("didn't find our range");
@ -256,7 +258,7 @@ static int test_find_delalloc(u32 sectorsize)
}
ret = 0;
out_bits:
clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1);
clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
out:
if (locked_page)
put_page(locked_page);
@ -432,6 +434,89 @@ out:
return ret;
}
static int test_find_first_clear_extent_bit(void)
{
struct extent_io_tree tree;
u64 start, end;
test_msg("running find_first_clear_extent_bit test");
extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
/*
* Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
* 4M-32M
*/
set_extent_bits(&tree, SZ_1M, SZ_4M - 1,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != 0 || end != SZ_1M -1)
test_err("error finding beginning range: start %llu end %llu",
start, end);
/* Now add 32M-64M so that we have a hole between 4M-32M */
set_extent_bits(&tree, SZ_32M, SZ_64M - 1,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
/*
* Request first hole starting at 12M, we should get 4M-32M
*/
find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != SZ_4M || end != SZ_32M - 1)
test_err("error finding trimmed range: start %llu end %llu",
start, end);
/*
* Search in the middle of allocated range, should get the next one
* available, which happens to be unallocated -> 4M-32M
*/
find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != SZ_4M || end != SZ_32M -1)
test_err("error finding next unalloc range: start %llu end %llu",
start, end);
/*
* Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag
* being unset in this range, we should get the entry in range 64M-72M
*/
set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED);
find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
CHUNK_TRIMMED);
if (start != SZ_64M || end != SZ_64M + SZ_8M - 1)
test_err("error finding exact range: start %llu end %llu",
start, end);
find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
CHUNK_TRIMMED);
/*
* Search in the middle of set range whose immediate neighbour doesn't
* have the bits set so it must be returned
*/
if (start != SZ_64M || end != SZ_64M + SZ_8M - 1)
test_err("error finding next alloc range: start %llu end %llu",
start, end);
/*
* Search beyond any known range, shall return after last known range
* and end should be -1
*/
find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
if (start != SZ_64M + SZ_8M || end != -1)
test_err(
"error handling beyond end of range search: start %llu end %llu",
start, end);
return 0;
}
int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
{
int ret;
@ -442,6 +527,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
if (ret)
goto out;
ret = test_find_first_clear_extent_bit();
if (ret)
goto out;
ret = test_eb_bitmaps(sectorsize, nodesize);
out:
return ret;

View File

@ -66,7 +66,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 16K)");
goto out;
@ -85,7 +87,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->len = SZ_4K;
em->block_start = SZ_32K; /* avoid merging */
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [16K, 20K)");
goto out;
@ -104,7 +108,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->len = len;
em->block_start = start;
em->block_len = len;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
goto out;
@ -148,7 +154,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->len = SZ_1K;
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 1K)");
goto out;
@ -167,7 +175,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->len = SZ_4K;
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
goto out;
@ -186,7 +196,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->len = SZ_1K;
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case2 [0 1K]: ret %d", ret);
goto out;
@ -225,7 +237,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->len = SZ_4K;
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
goto out;
@ -244,7 +258,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case3 [0x%llx 0x%llx): ret %d",
start, start + len, ret);
@ -320,7 +336,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->len = SZ_8K;
em->block_start = 0;
em->block_len = SZ_8K;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 8K)");
goto out;
@ -339,7 +357,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->len = 24 * SZ_1K;
em->block_start = SZ_16K; /* avoid merging */
em->block_len = 24 * SZ_1K;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [8K, 32K)");
goto out;
@ -357,7 +377,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->len = SZ_32K;
em->block_start = 0;
em->block_len = SZ_32K;
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
test_err("case4 [0x%llx 0x%llx): ret %d",
start, len, ret);

View File

@ -128,6 +128,24 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
return atomic_read(&trans->num_extwriters);
}
/*
 * Hand the chunk metadata reservation recorded in @trans back to the chunk
 * block reserve.
 *
 * Call this only once all new block groups attached to the transaction handle
 * have been created (btrfs_create_pending_block_groups()); a non-empty
 * new_bgs list triggers a one-time warning. Nothing happens when the handle
 * holds no chunk reservation.
 */
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
	if (trans->chunk_bytes_reserved) {
		struct btrfs_fs_info *fs_info = trans->fs_info;

		WARN_ON_ONCE(!list_empty(&trans->new_bgs));

		btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
					trans->chunk_bytes_reserved);
		/* The reservation is gone, make a repeated call a no-op. */
		trans->chunk_bytes_reserved = 0;
	}
}
/*
* either allocate a new transaction or hop into the existing one
*/

View File

@ -224,5 +224,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
#endif

View File

@ -132,6 +132,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
struct btrfs_file_extent_item *fi;
u32 sectorsize = fs_info->sectorsize;
u32 item_size = btrfs_item_size_nr(leaf, slot);
u64 extent_end;
if (!IS_ALIGNED(key->offset, sectorsize)) {
file_extent_err(leaf, slot,
@ -207,6 +208,16 @@ static int check_extent_data_item(struct extent_buffer *leaf,
CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize))
return -EUCLEAN;
/* Catch extent end overflow */
if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi),
key->offset, &extent_end)) {
file_extent_err(leaf, slot,
"extent end overflow, have file offset %llu extent num bytes %llu",
key->offset,
btrfs_file_extent_num_bytes(leaf, fi));
return -EUCLEAN;
}
/*
* Check that no two consecutive file extent items, in the same leaf,
* present ranges that overlap each other.

View File

@ -3322,6 +3322,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
return 0;
}
/*
 * Decide whether @inode was logged in the transaction of @trans.
 *
 * logged_trans alone is not reliable: it is an in-memory only field, so it is
 * not persisted and reads as 0 (smaller than any possible transaction ID)
 * after the inode has been evicted and loaded again from disk. When that
 * happens the full_sync flag is set in the inode's runtime flags. So if the
 * inode was last modified in the current transaction and carries that flag,
 * assume eviction happened and take the worst case: the inode was logged
 * before in the current transaction. That fallback is not applied while log
 * recovery is in progress.
 */
static bool inode_logged(struct btrfs_trans_handle *trans,
			 struct btrfs_inode *inode)
{
	/* Exact knowledge: logged in this very transaction. */
	if (inode->logged_trans == trans->transid)
		return true;

	/* Not touched in this transaction, so it cannot have been logged. */
	if (inode->last_trans != trans->transid)
		return false;

	/* No sign of an eviction, logged_trans can be trusted. */
	if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
		return false;

	return !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags);
}
/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
if (dir->logged_trans < trans->transid)
if (!inode_logged(trans, dir))
return 0;
ret = join_running_log_trans(root);
@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
u64 index;
int ret;
if (inode->logged_trans < trans->transid)
if (!inode_logged(trans, inode))
return 0;
ret = join_running_log_trans(root);
@ -5420,9 +5444,19 @@ log_extents:
}
}
/*
* Don't update last_log_commit if we logged that an inode exists after
* it was loaded to memory (full_sync bit set).
* This is to prevent data loss when we do a write to the inode, then
* the inode gets evicted after all delalloc was flushed, then we log
* it exists (due to a rename for example) and then fsync it. This last
* fsync would do nothing (not logging the extents previously written).
*/
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
inode->last_log_commit = inode->last_sub_trans;
if (inode_only != LOG_INODE_EXISTS ||
!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
mutex_unlock(&inode->log_mutex);

View File

@ -28,6 +28,7 @@
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
@ -123,12 +124,14 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
const char *get_raid_name(enum btrfs_raid_types type)
const char *btrfs_bg_type_to_raid_name(u64 flags)
{
if (type >= BTRFS_NR_RAID_TYPES)
const int index = btrfs_bg_flags_to_raid_index(flags);
if (index >= BTRFS_NR_RAID_TYPES)
return NULL;
return btrfs_raid_array[type].raid_name;
return btrfs_raid_array[index].raid_name;
}
/*
@ -237,7 +240,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* chunk_mutex
* -----------
* protects chunks, adding or removing during allocation, trim or when a new
* device is added/removed
* device is added/removed. Additionally it also protects post_commit_list of
* individual devices, since they can be added to the transaction's
* post_commit_list only with chunk_mutex held.
*
* cleaner_mutex
* -------------
@ -1818,7 +1823,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
struct rb_node *n;
u64 ret = 0;
em_tree = &fs_info->mapping_tree.map_tree;
em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
n = rb_last(&em_tree->map.rb_root);
if (n) {
@ -2941,7 +2946,7 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree;
struct extent_map *em;
em_tree = &fs_info->mapping_tree.map_tree;
em_tree = &fs_info->mapping_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, length);
read_unlock(&em_tree->lock);
@ -3474,6 +3479,18 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
return 1;
}
/*
 * Number of stripes that carry data for a chunk of block group @type made of
 * @num_stripes stripes.
 *
 * Profiles with parity lose nparity stripes to parity; all other profiles
 * divide the stripes by their number of copies.
 */
static u64 calc_data_stripes(u64 type, int num_stripes)
{
	const int raid_idx = btrfs_bg_flags_to_raid_index(type);
	const int parity = btrfs_raid_array[raid_idx].nparity;
	const int copies = btrfs_raid_array[raid_idx].ncopies;

	return parity ? num_stripes - parity : num_stripes / copies;
}
/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
@ -3483,22 +3500,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
u64 stripe_offset;
u64 stripe_length;
u64 type;
int factor;
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
return 0;
if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
factor = num_stripes / 2;
} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
factor = num_stripes - 1;
} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
factor = num_stripes - 2;
} else {
factor = num_stripes;
}
type = btrfs_chunk_type(leaf, chunk);
factor = calc_data_stripes(type, num_stripes);
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
@ -3921,11 +3931,9 @@ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
bp += ret; \
} while (0)
if (flags & BTRFS_BALANCE_ARGS_CONVERT) {
int index = btrfs_bg_flags_to_raid_index(bargs->target);
CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index));
}
if (flags & BTRFS_BALANCE_ARGS_CONVERT)
CHECK_APPEND_1ARG("convert=%s,",
btrfs_bg_type_to_raid_name(bargs->target));
if (flags & BTRFS_BALANCE_ARGS_SOFT)
CHECK_APPEND_NOARG("soft,");
@ -4047,6 +4055,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
u64 num_devices;
unsigned seq;
bool reducing_integrity;
int i;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
@ -4076,48 +4085,43 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
num_devices = btrfs_num_devices(fs_info);
allowed = 0;
for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
if (num_devices >= btrfs_raid_array[i].devs_min)
allowed |= btrfs_raid_array[i].bg_flag;
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
if (num_devices > 2)
allowed |= BTRFS_BLOCK_GROUP_RAID5;
if (num_devices > 3)
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
if (validate_convert_profile(&bctl->data, allowed)) {
int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
btrfs_err(fs_info,
"balance: invalid convert data profile %s",
get_raid_name(index));
btrfs_bg_type_to_raid_name(bctl->data.target));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->meta, allowed)) {
int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
btrfs_err(fs_info,
"balance: invalid convert metadata profile %s",
get_raid_name(index));
btrfs_bg_type_to_raid_name(bctl->meta.target));
ret = -EINVAL;
goto out;
}
if (validate_convert_profile(&bctl->sys, allowed)) {
int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
btrfs_err(fs_info,
"balance: invalid convert system profile %s",
get_raid_name(index));
btrfs_bg_type_to_raid_name(bctl->sys.target));
ret = -EINVAL;
goto out;
}
/* allow to reduce meta or sys integrity only if force set */
allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6;
/*
* Allow to reduce metadata or system integrity only if force set for
* profiles with redundancy (copies, parity)
*/
allowed = 0;
for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
if (btrfs_raid_array[i].ncopies >= 2 ||
btrfs_raid_array[i].tolerated_failures >= 1)
allowed |= btrfs_raid_array[i].bg_flag;
}
do {
seq = read_seqbegin(&fs_info->profiles_lock);
@ -4152,12 +4156,18 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
int data_index = btrfs_bg_flags_to_raid_index(data_target);
btrfs_warn(fs_info,
"balance: metadata profile %s has lower redundancy than data profile %s",
get_raid_name(meta_index), get_raid_name(data_index));
btrfs_bg_type_to_raid_name(meta_target),
btrfs_bg_type_to_raid_name(data_target));
}
if (fs_info->send_in_progress) {
btrfs_warn_rl(fs_info,
"cannot run balance while send operations are in progress (%d in progress)",
fs_info->send_in_progress);
ret = -EAGAIN;
goto out;
}
ret = insert_balance_item(fs_info, bctl);
@ -4949,6 +4959,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
sub_stripes = btrfs_raid_array[index].sub_stripes;
dev_stripes = btrfs_raid_array[index].dev_stripes;
devs_max = btrfs_raid_array[index].devs_max;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info);
devs_min = btrfs_raid_array[index].devs_min;
devs_increment = btrfs_raid_array[index].devs_increment;
ncopies = btrfs_raid_array[index].ncopies;
@ -4957,8 +4969,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = SZ_1G;
max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
@ -4966,13 +4976,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
else
max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
} else {
btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
@ -5143,7 +5149,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em->block_len = em->len;
em->orig_block_len = stripe_size;
em_tree = &info->mapping_tree.map_tree;
em_tree = &info->mapping_tree;
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em, 0);
if (ret) {
@ -5324,20 +5330,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
int max_errors;
const int index = btrfs_bg_flags_to_raid_index(map->type);
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_DUP)) {
max_errors = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
max_errors = 2;
} else {
max_errors = 0;
}
return max_errors;
return btrfs_raid_array[index].tolerated_failures;
}
int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
@ -5378,21 +5373,16 @@ end:
return readonly;
}
void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
extent_map_tree_init(&tree->map_tree);
}
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
void btrfs_mapping_tree_free(struct extent_map_tree *tree)
{
struct extent_map *em;
while (1) {
write_lock(&tree->map_tree.lock);
em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, 0, (u64)-1);
if (em)
remove_extent_mapping(&tree->map_tree, em);
write_unlock(&tree->map_tree.lock);
remove_extent_mapping(tree, em);
write_unlock(&tree->lock);
if (!em)
break;
/* once for us */
@ -5419,7 +5409,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
@ -5493,7 +5483,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev;
ASSERT((map->type &
(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
(BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = map->sub_stripes;
@ -5682,7 +5672,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
&remaining_stripes);
div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
last_stripe *= sub_stripes;
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
BTRFS_BLOCK_GROUP_DUP)) {
num_stripes = map->num_stripes;
} else {
@ -5926,6 +5916,102 @@ static bool need_full_stripe(enum btrfs_map_op op)
return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
}
/*
 * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
 *			   tuple. This information is used to calculate how big a
 *			   particular bio can get before it straddles a stripe.
 *
 * @fs_info - the filesystem
 * @op      - type of operation - write or read (must not be BTRFS_MAP_DISCARD)
 * @logical - address that we want to figure out the geometry of
 * @len     - the length of IO we are going to perform, starting at @logical
 * @io_geom - pointer used to return values; only written on success
 *
 * The chunk map looked up here is used only for the calculation: the
 * reference taken on it is dropped again before returning, on both the
 * success and the error path. Callers that need the map must look it up
 * themselves.
 *
 * Returns < 0 in case a chunk for the given logical address cannot be found,
 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
 */
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
			u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
{
	struct extent_map *em;
	struct map_lookup *map;
	u64 offset;
	u64 stripe_offset;
	u64 stripe_nr;
	u64 stripe_len;
	u64 raid56_full_stripe_start = (u64)-1;
	int data_stripes;
	int ret = 0;

	ASSERT(op != BTRFS_MAP_DISCARD);

	em = btrfs_get_chunk_map(fs_info, logical, len);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	/* Offset of this logical address in the chunk */
	offset = logical - em->start;
	/* Len of a stripe in a chunk */
	stripe_len = map->stripe_len;
	/* Stripe where this block falls in */
	stripe_nr = div64_u64(offset, stripe_len);
	/* Offset of stripe in the chunk */
	stripe_offset = stripe_nr * stripe_len;
	if (offset < stripe_offset) {
		btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
			stripe_offset, offset, em->start, logical, stripe_len);
		ret = -EINVAL;
		goto out;
	}

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - stripe_offset;
	data_stripes = nr_data_stripes(map);

	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		u64 max_len = stripe_len - stripe_offset;

		/*
		 * In case of raid56, we need to know the stripe aligned start
		 */
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
			/* u64 so the product is never narrowed on 32-bit */
			u64 full_stripe_len = stripe_len * data_stripes;

			/*
			 * Allow a write of a full stripe, but make sure we
			 * don't allow straddling of stripes
			 */
			raid56_full_stripe_start = div64_u64(offset,
							full_stripe_len);
			raid56_full_stripe_start *= full_stripe_len;

			/*
			 * For writes to RAID[56], allow a full stripeset across
			 * all disks. For other RAID types and for RAID[56]
			 * reads, just allow a single stripe (on a single disk).
			 */
			if (op == BTRFS_MAP_WRITE) {
				max_len = full_stripe_len -
					  (offset - raid56_full_stripe_start);
			}
		}
		len = min_t(u64, em->len - offset, max_len);
	} else {
		len = em->len - offset;
	}

	io_geom->len = len;
	io_geom->offset = offset;
	io_geom->stripe_len = stripe_len;
	io_geom->stripe_nr = stripe_nr;
	io_geom->stripe_offset = stripe_offset;
	io_geom->raid56_stripe_offset = raid56_full_stripe_start;

out:
	/* once for us: drop the reference taken by btrfs_get_chunk_map() */
	free_extent_map(em);
	return ret;
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
enum btrfs_map_op op,
u64 logical, u64 *length,
@ -5939,6 +6025,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
u64 stripe_nr;
u64 stripe_len;
u32 stripe_index;
int data_stripes;
int i;
int ret = 0;
int num_stripes;
@ -5951,76 +6038,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int patch_the_first_stripe_for_dev_replace = 0;
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
struct btrfs_io_geometry geom;
ASSERT(bbio_ret);
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
*length, bbio_ret);
ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
if (ret < 0)
return ret;
em = btrfs_get_chunk_map(fs_info, logical, *length);
if (IS_ERR(em))
return PTR_ERR(em);
ASSERT(em);
map = em->map_lookup;
offset = logical - em->start;
stripe_len = map->stripe_len;
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
stripe_nr = div64_u64(stripe_nr, stripe_len);
stripe_offset = stripe_nr * stripe_len;
if (offset < stripe_offset) {
btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
stripe_offset, offset, em->start, logical,
stripe_len);
free_extent_map(em);
return -EINVAL;
}
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
/* if we're here for raid56, we need to know the stripe aligned start */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
raid56_full_stripe_start = offset;
/* allow a write of a full stripe, but make sure we don't
* allow straddling of stripes
*/
raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
full_stripe_len);
raid56_full_stripe_start *= full_stripe_len;
}
if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
u64 max_len;
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
stripe (on a single disk). */
if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
(op == BTRFS_MAP_WRITE)) {
max_len = stripe_len * nr_data_stripes(map) -
(offset - raid56_full_stripe_start);
} else {
/* we limit the length of each bio to what fits in a stripe */
max_len = stripe_len - stripe_offset;
}
*length = min_t(u64, em->len - offset, max_len);
} else {
*length = em->len - offset;
}
/*
* This is for when we're called from btrfs_bio_fits_in_stripe and all
* it cares about is the length
*/
if (!bbio_ret)
goto out;
*length = geom.len;
offset = geom.offset;
stripe_len = geom.stripe_len;
stripe_nr = geom.stripe_nr;
stripe_offset = geom.stripe_offset;
raid56_full_stripe_start = geom.raid56_stripe_offset;
data_stripes = nr_data_stripes(map);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@ -6052,7 +6092,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
&stripe_index);
if (!need_full_stripe(op))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
if (need_full_stripe(op))
num_stripes = map->num_stripes;
else if (mirror_num)
@ -6094,7 +6134,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div64_u64(raid56_full_stripe_start,
stripe_len * nr_data_stripes(map));
stripe_len * data_stripes);
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
@ -6110,10 +6150,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* Mirror #3 is RAID6 Q block.
*/
stripe_nr = div_u64_rem(stripe_nr,
nr_data_stripes(map), &stripe_index);
data_stripes, &stripe_index);
if (mirror_num > 1)
stripe_index = nr_data_stripes(map) +
mirror_num - 2;
stripe_index = data_stripes + mirror_num - 2;
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
@ -6171,8 +6210,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
div_u64_rem(stripe_nr, num_stripes, &rot);
/* Fill in the logical address of each stripe */
tmp = stripe_nr * nr_data_stripes(map);
for (i = 0; i < nr_data_stripes(map); i++)
tmp = stripe_nr * data_stripes;
for (i = 0; i < data_stripes; i++)
bbio->raid_map[(i+rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
@ -6687,7 +6726,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
u64 logical;
@ -6712,9 +6751,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
return ret;
}
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
read_unlock(&map_tree->map_tree.lock);
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, logical, 1);
read_unlock(&map_tree->lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) {
@ -6783,9 +6822,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
}
write_lock(&map_tree->map_tree.lock);
ret = add_extent_mapping(&map_tree->map_tree, em, 0);
write_unlock(&map_tree->map_tree.lock);
write_lock(&map_tree->lock);
ret = add_extent_mapping(map_tree, em, 0);
write_unlock(&map_tree->lock);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
@ -7103,14 +7142,14 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
u64 next_start = 0;
bool ret = true;
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
read_unlock(&map_tree->map_tree.lock);
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, 0, (u64)-1);
read_unlock(&map_tree->lock);
/* No chunk at all? Return false anyway */
if (!em) {
ret = false;
@ -7148,10 +7187,10 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
next_start = extent_map_end(em);
free_extent_map(em);
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, next_start,
read_lock(&map_tree->lock);
em = lookup_extent_mapping(map_tree, next_start,
(u64)(-1) - next_start);
read_unlock(&map_tree->map_tree.lock);
read_unlock(&map_tree->lock);
}
out:
return ret;
@ -7600,10 +7639,9 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
*/
int btrfs_bg_type_to_factor(u64 flags)
{
if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
return 2;
return 1;
const int index = btrfs_bg_flags_to_raid_index(flags);
return btrfs_raid_array[index].ncopies;
}
@ -7612,7 +7650,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 chunk_offset, u64 devid,
u64 physical_offset, u64 physical_len)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_device *dev;
@ -7701,7 +7739,7 @@ out:
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct rb_node *node;
int ret = 0;

View File

@ -23,6 +23,21 @@ struct btrfs_pending_bios {
struct bio *tail;
};
/*
 * Result of btrfs_get_io_geometry(): where a logical address sits inside
 * its chunk and how much IO can be issued from there.
 */
struct btrfs_io_geometry {
/*
 * remaining bytes before crossing a stripe (a full stripe for RAID56
 * writes), never more than what is left of the chunk
 */
u64 len;
/* offset of logical address in chunk (logical - chunk start) */
u64 offset;
/* length of single IO stripe, copied from the chunk's map */
u64 stripe_len;
/* number of stripe where address falls (offset / stripe_len) */
u64 stripe_nr;
/* offset of address in stripe (offset - stripe_nr * stripe_len) */
u64 stripe_offset;
/*
 * offset of raid56 stripe into the chunk: @offset rounded down to a
 * multiple of the full stripe length; left at (u64)-1 when the chunk is
 * not RAID5/6
 */
u64 raid56_stripe_offset;
};
/*
* Use sequence counter to get consistent device stat data on
* 32-bit processors.
@ -43,8 +58,8 @@ struct btrfs_pending_bios {
#define BTRFS_DEV_STATE_FLUSH_SENT (4)
struct btrfs_device {
struct list_head dev_list;
struct list_head dev_alloc_list;
struct list_head dev_list; /* device_list_mutex */
struct list_head dev_alloc_list; /* chunk mutex */
struct list_head post_commit_list; /* chunk mutex */
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
@ -229,9 +244,14 @@ struct btrfs_fs_devices {
* this mutex lock.
*/
struct mutex device_list_mutex;
/* List of all devices, protected by device_list_mutex */
struct list_head devices;
/* devices not currently being allocated */
/*
* Devices which can satisfy space allocation. Protected by
* chunk_mutex
*/
struct list_head alloc_list;
struct btrfs_fs_devices *seed;
@ -336,16 +356,16 @@ struct btrfs_device_info {
};
struct btrfs_raid_attr {
int sub_stripes; /* sub_stripes info for map */
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
int devs_min; /* min devs needed */
int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
int nparity; /* number of stripes worth of bytes to store
u8 sub_stripes; /* sub_stripes info for map */
u8 dev_stripes; /* stripes per dev */
u8 devs_max; /* max devs to use */
u8 devs_min; /* min devs needed */
u8 tolerated_failures; /* max tolerated fail devs */
u8 devs_increment; /* ndevs has to be a multiple of this */
u8 ncopies; /* how many copies to data has */
u8 nparity; /* number of stripes worth of bytes to store
* parity information */
int mindev_error; /* error code if min devs requisite is unmet */
u8 mindev_error; /* error code if min devs requisite is unmet */
const char raid_name[8]; /* name of the raid */
u64 bg_flag; /* block group flag of the raid */
};
@ -408,13 +428,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret);
int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, int async_submit);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
@ -557,8 +578,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
const char *get_raid_name(enum btrfs_raid_types type);
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head *btrfs_get_fs_uuids(void);
@ -568,6 +587,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
#endif

View File

@ -29,6 +29,7 @@ struct btrfs_qgroup_extent_record;
struct btrfs_qgroup;
struct extent_io_tree;
struct prelim_ref;
struct btrfs_space_info;
TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR);
TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS);
@ -2091,6 +2092,45 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic);
/*
 * Event class shared by the update_bytes_may_use and update_bytes_pinned
 * tracepoints defined below. Each event records the flags of the space_info
 * (printed symbolically via BTRFS_GROUP_FLAGS), the @old value and the signed
 * @diff passed by the caller.
 * NOTE(review): @old is presumably the counter value before @diff is
 * applied - confirm against the call sites.
 */
DECLARE_EVENT_CLASS(btrfs__space_info_update,
TP_PROTO(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, u64 old, s64 diff),
TP_ARGS(fs_info, sinfo, old, diff),
TP_STRUCT__entry_btrfs(
__field( u64, type )
__field( u64, old )
__field( s64, diff )
),
TP_fast_assign_btrfs(fs_info,
__entry->type = sinfo->flags;
__entry->old = old;
__entry->diff = diff;
),
TP_printk_btrfs("type=%s old=%llu diff=%lld",
__print_flags(__entry->type, "|", BTRFS_GROUP_FLAGS),
__entry->old, __entry->diff)
);
/* Instance of the class above, named update_bytes_may_use */
DEFINE_EVENT(btrfs__space_info_update, update_bytes_may_use,
TP_PROTO(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, u64 old, s64 diff),
TP_ARGS(fs_info, sinfo, old, diff)
);
/* Instance of the class above, named update_bytes_pinned */
DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned,
TP_PROTO(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *sinfo, u64 old, s64 diff),
TP_ARGS(fs_info, sinfo, old, diff)
);
#endif /* _TRACE_BTRFS_H */
/* This part must be outside protection */

View File

@ -866,6 +866,8 @@ enum btrfs_raid_types {
#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
BTRFS_BLOCK_GROUP_RAID6)
/* Mask form of the RAID1 profile bit; currently expands to RAID1 only */
#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1)
/*
* We need a bit for restriper to be able to tell when chunks of type
* SINGLE are available. This "extended" profile format is used in