for-6.13-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmc0zT4ACgkQxWXV+ddt
 WDtThRAAhzSSiHcJqTfCL5nHh7w85MNEVw28o1ETgXSYJmx0JOWLE7Znlp2FV7jj
 IbYkFfF2gXJzYvRZkcXB/TAHV9KJG5yZIBZfccbM+9db9f8xkImVKMuqQRXPU41R
 ppSCmqZTeujtt8ucsaJkMpm6pzECKJCJaGOsMJ8fiqKpo89dKO3eGAVboSbpPF4C
 r0YmppiBwSP/cCXQCqWxZRbqPGN+lUgZpIGNRi157kehfmRHlVVJTO1pgqK8PCXb
 uIT09Kulppfez8+1A10CPcniDTyinLik/qLTNlzdWoDBL4iNJMg0A0wsA04AJVf0
 PdOS0REusiv3QcEIO6PefuRFRRfXcSLPpPDUceltJT5O0uM2gUqf2C7dEHXUGU3o
 TdgYlbQpsJWpZ7VGWQDZeGGV04lOPQvu0LGLPgEerUQd5H9ABa0dX8Fn0sPhKsa8
 whpAcdfE4rdNxB2OJFnqQeFq0z3cSjP/rvKlluCmAj97QYI+kiu3QyhemcT1YSC9
 U7n5Ya9IzIYCN3ml54q3hEgyD0IVGGG20GuUmqC9XSP9mrQRC8I1g7v26AiOTrrk
 VhgSdtMmphDxXudifsnYMaQ0Z1QqiUrW1SM/prAEOnBYCo75+HDsTgrq9ithgHoI
 4xz4YXJyMRs18qfTJctXC1wmGuz5plTdQrwarHdNsELN5HEyqX4=
 =aAcf
 -----END PGP SIGNATURE-----

Merge tag 'for-6.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "Changes outside of btrfs: add io_uring command flag to track a dying
  task (the rest will go via the block git tree).

  User visible changes:

   - wire encoded read (ioctl) to io_uring commands; this can be used on
     its own, and in the future it will allow 'send' to be asynchronous. As
     a consequence, the encoded read ioctl can also work in non-blocking
     mode

   - new ioctl to wait for cleaned subvolumes, no need to use the
     generic and root-only SEARCH_TREE ioctl, will be used by "btrfs
     subvol sync"

   - recognize different paths/symlinks for the same devices and don't
     report them during rescanning, this can be observed with LVM or DM

   - seeding device use case change, the sprout device (the one
     capturing new writes) will not clear the read-only status of the
     super block; this prevents accumulating space from deleted
     snapshots

  Performance improvements:

   - reduce lock contention when traversing extent buffers

   - reduce extent tree lock contention when searching for inline
     backref

   - switch from rb-trees to xarray for delayed ref tracking,
     improvements due to better cache locality, branching factors and
     more compact data structures

   - enable extent map shrinker again (prevent memory exhaustion under
     some types of IO load), reworked to run in a single worker thread
     (there used to be problems causing long stalls under memory
     pressure)

  Core changes:

   - raid-stripe-tree feature updates:
       - make device replace and scrub work
       - implement partial deletion of stripe extents
       - new selftests

   - split the config option BTRFS_DEBUG and add EXPERIMENTAL for
     features that are experimental or with known problems so we don't
     misuse debugging config for that

   - subpage mode updates (sector < page):
       - update compression implementations
       - update writepage, writeback

   - continued folio API conversions:
       - buffered writes

   - make buffered write copy one page at a time, as preparatory work for
     future integration with large folios; this may cause a performance drop

   - proper locking of root item regarding starting send

   - error handling improvements

   - code cleanups and refactoring:
       - dead code removal
       - unused parameter reduction
       - lockdep assertions"

* tag 'for-6.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (119 commits)
  btrfs: send: check for read-only send root under critical section
  btrfs: send: check for dead send root under critical section
  btrfs: remove check for NULL fs_info at btrfs_folio_end_lock_bitmap()
  btrfs: fix warning on PTR_ERR() against NULL device at btrfs_control_ioctl()
  btrfs: fix a typo in btrfs_use_zone_append
  btrfs: avoid superfluous calls to free_extent_map() in btrfs_encoded_read()
  btrfs: simplify logic to decrement snapshot counter at btrfs_mksnapshot()
  btrfs: remove hole from struct btrfs_delayed_node
  btrfs: update stale comment for struct btrfs_delayed_ref_node::add_list
  btrfs: add new ioctl to wait for cleaned subvolumes
  btrfs: simplify range tracking in cow_file_range()
  btrfs: remove conditional path allocation in btrfs_read_locked_inode()
  btrfs: push cleanup into btrfs_read_locked_inode()
  io_uring/cmd: let cmds to know about dying task
  btrfs: add struct io_btrfs_cmd as type for io_uring_cmd_to_pdu()
  btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)
  btrfs: move priv off stack in btrfs_encoded_read_regular_fill_pages()
  btrfs: don't sleep in btrfs_encoded_read() if IOCB_NOWAIT is set
  btrfs: change btrfs_encoded_read() so that reading of extent is done by caller
  btrfs: remove pointless iocb::ki_pos addition in btrfs_encoded_read()
  ...
Committed by Linus Torvalds, 2024-11-18 16:37:41 -08:00 (commit c14a8a4c04)
67 files changed, 2470 additions and 1432 deletions


@@ -78,6 +78,32 @@ config BTRFS_ASSERT
 	  If unsure, say N.
 
+config BTRFS_EXPERIMENTAL
+	bool "Btrfs experimental features"
+	depends on BTRFS_FS
+	default n
+	help
+	  Enable experimental features. These features may not be stable enough
+	  for end users. This is meant for btrfs developers or users who wish
+	  to test the functionality and report problems.
+
+	  Current list:
+
+	  - extent map shrinker - performance problems with too frequent shrinks
+
+	  - send stream protocol v3 - fs-verity support
+
+	  - checksum offload mode - sysfs knob to affect when checksums are
+				    calculated (at IO time, or in a thread)
+
+	  - raid-stripe-tree - additional mapping of extents to devices to
+			       support RAID1* profiles on zoned devices,
+			       RAID56 not yet supported
+
+	  - extent tree v2 - complex rework of extent tracking
+
+	  If unsure, say N.
+
 config BTRFS_FS_REF_VERIFY
 	bool "Btrfs with the ref verify tool compiled in"
 	depends on BTRFS_FS
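A later hunk in this series (in the bio code below) switches a checksum-offload code path from CONFIG_BTRFS_DEBUG to the new CONFIG_BTRFS_EXPERIMENTAL gate. As a minimal, standalone illustration of that compile-time gating pattern (not kernel code; here the macro is simply passed on the compiler command line):

/* Build with: cc -DCONFIG_BTRFS_EXPERIMENTAL gate_demo.c -o gate_demo */
#include <stdio.h>

static int experimental_paths_compiled_in(void)
{
#ifdef CONFIG_BTRFS_EXPERIMENTAL
	/* Code guarded this way only exists when the option is enabled. */
	return 1;
#else
	return 0;
#endif
}

int main(void)
{
	printf("experimental code paths compiled in: %s\n",
	       experimental_paths_compiled_in() ? "yes" : "no");
	return 0;
}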


@@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
 	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
-	tests/free-space-tree-tests.o tests/extent-map-tests.o
+	tests/free-space-tree-tests.o tests/extent-map-tests.o \
+	tests/raid-stripe-tree-tests.o


@@ -1442,7 +1442,8 @@ again:
 	 */
 	delayed_refs = &ctx->trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
-	head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr);
+	head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs,
+					   ctx->bytenr);
 	if (head) {
 		if (!mutex_trylock(&head->mutex)) {
 			refcount_inc(&head->refs);


@@ -587,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
 {
 	bool auto_csum_mode = true;
 
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
 	struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
 	enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);


@@ -2797,7 +2797,7 @@ next:
 		 * uncompressed data size, because the compression is only done
 		 * when writeback triggered and we don't know how much space we
 		 * are actually going to need, so we reserve the uncompressed
-		 * size because the data may be uncompressible in the worst case.
+		 * size because the data may be incompressible in the worst case.
 		 */
 		if (ret == 0) {
 			bool used;


@@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
 				 struct extent_state *other);
 void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
 				 struct extent_state *orig, u64 split);
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
 void btrfs_evict_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
@@ -613,11 +612,17 @@ int btrfs_writepage_cow_fixup(struct folio *folio);
 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
 					     int compress_type);
 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
-					  u64 file_offset, u64 disk_bytenr,
-					  u64 disk_io_size,
-					  struct page **pages);
+					  u64 disk_bytenr, u64 disk_io_size,
+					  struct page **pages, void *uring_ctx);
 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
-			   struct btrfs_ioctl_encoded_io_args *encoded);
+			   struct btrfs_ioctl_encoded_io_args *encoded,
+			   struct extent_state **cached_state,
+			   u64 *disk_bytenr, u64 *disk_io_size);
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+				   u64 start, u64 lockend,
+				   struct extent_state **cached_state,
+				   u64 disk_bytenr, u64 disk_io_size,
+				   size_t count, bool compressed, bool *unlocked);
 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 			       const struct btrfs_ioctl_encoded_io_args *encoded);


@@ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		if (pg_index > end_index)
 			break;
 
-		folio = __filemap_get_folio(mapping, pg_index, 0, 0);
+		folio = filemap_get_folio(mapping, pg_index);
 		if (!IS_ERR(folio)) {
 			u64 folio_sz = folio_size(folio);
 			u64 offset = offset_in_folio(folio, cur);
@@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		 * subpage::readers and to unlock the page.
 		 */
 		if (fs_info->sectorsize < PAGE_SIZE)
-			btrfs_subpage_start_reader(fs_info, folio, cur,
-						   add_size);
+			btrfs_folio_set_lock(fs_info, folio, cur, add_size);
 		folio_put(folio);
 		cur += add_size;
 	}
@@ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws)
 	kfree(workspace);
 }
 
-static struct list_head *alloc_heuristic_ws(unsigned int level)
+static struct list_head *alloc_heuristic_ws(void)
 {
 	struct heuristic_ws *ws;
@@ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 static struct list_head *alloc_workspace(int type, unsigned int level)
 {
 	switch (type) {
-	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
+	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
 	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
-	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level);
+	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace();
 	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
 	default:
 		/*
@@ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping
 {
 	int type = btrfs_compress_type(type_level);
 	int level = btrfs_compress_level(type_level);
+	const unsigned long orig_len = *total_out;
 	struct list_head *workspace;
 	int ret;
@@ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping
 	workspace = get_workspace(type, level);
 	ret = compression_compress_pages(type, workspace, mapping, start, folios,
 					 out_folios, total_in, total_out);
+	/* The total read-in bytes should be no larger than the input. */
+	ASSERT(*total_in <= orig_len);
 	put_workspace(type, workspace);
 	return ret;
 }
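The last hunk above adds a cheap invariant check: a compressor must never claim to have consumed more input bytes than the caller passed in. A hedged userspace sketch of the same defensive pattern, using a hypothetical compress callback rather than the btrfs API:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical compressor callback: reports bytes consumed and produced. */
typedef int (*compress_fn)(const unsigned char *in, size_t in_len,
			   unsigned char *out, size_t out_cap,
			   size_t *total_in, size_t *total_out);

static int dummy_compress(const unsigned char *in, size_t in_len,
			  unsigned char *out, size_t out_cap,
			  size_t *total_in, size_t *total_out)
{
	size_t n = in_len < out_cap ? in_len : out_cap;

	memcpy(out, in, n);	/* "compression" that just copies */
	*total_in = n;
	*total_out = n;
	return 0;
}

static int compress_checked(compress_fn fn, const unsigned char *in,
			    size_t in_len, unsigned char *out, size_t out_cap)
{
	const size_t orig_len = in_len;
	size_t total_in = 0, total_out = 0;
	int ret = fn(in, in_len, out, out_cap, &total_in, &total_out);

	/* The total read-in bytes should be no larger than the input. */
	assert(total_in <= orig_len);
	return ret ? ret : (int)total_out;
}

int main(void)
{
	unsigned char in[64] = "some data", out[64];

	printf("produced %d bytes\n",
	       compress_checked(dummy_compress, in, sizeof(in), out, sizeof(out)));
	return 0;
}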


@@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
 int lzo_decompress(struct list_head *ws, const u8 *data_in,
 		struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
 		size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
+struct list_head *lzo_alloc_workspace(void);
 void lzo_free_workspace(struct list_head *ws);
 
 int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,


@@ -1508,26 +1508,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
  */
 static int
 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
-		      struct extent_buffer **eb_ret, int level, int slot,
+		      struct extent_buffer **eb_ret, int slot,
 		      const struct btrfs_key *key)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_tree_parent_check check = { 0 };
 	u64 blocknr;
-	u64 gen;
-	struct extent_buffer *tmp;
-	int ret;
+	struct extent_buffer *tmp = NULL;
+	int ret = 0;
 	int parent_level;
-	bool unlock_up;
+	int err;
+	bool read_tmp = false;
+	bool tmp_locked = false;
+	bool path_released = false;
 
-	unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
 	blocknr = btrfs_node_blockptr(*eb_ret, slot);
-	gen = btrfs_node_ptr_generation(*eb_ret, slot);
 	parent_level = btrfs_header_level(*eb_ret);
 	btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
 	check.has_first_key = true;
 	check.level = parent_level - 1;
-	check.transid = gen;
+	check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
 	check.owner_root = btrfs_root_id(root);
 
 	/*
@@ -1540,80 +1540,116 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	tmp = find_extent_buffer(fs_info, blocknr);
 	if (tmp) {
 		if (p->reada == READA_FORWARD_ALWAYS)
-			reada_for_search(fs_info, p, level, slot, key->objectid);
+			reada_for_search(fs_info, p, parent_level, slot, key->objectid);
 
 		/* first we do an atomic uptodate check */
-		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+		if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
 			/*
 			 * Do extra check for first_key, eb can be stale due to
 			 * being cached, read from scrub, or have multiple
 			 * parents (shared tree blocks).
 			 */
-			if (btrfs_verify_level_key(tmp,
-					parent_level - 1, &check.first_key, gen)) {
-				free_extent_buffer(tmp);
-				return -EUCLEAN;
+			if (btrfs_verify_level_key(tmp, &check)) {
+				ret = -EUCLEAN;
+				goto out;
 			}
 			*eb_ret = tmp;
-			return 0;
+			tmp = NULL;
+			ret = 0;
+			goto out;
 		}
 
 		if (p->nowait) {
-			free_extent_buffer(tmp);
-			return -EAGAIN;
-		}
-
-		if (unlock_up)
-			btrfs_unlock_up_safe(p, level + 1);
-
-		/* now we're allowed to do a blocking uptodate check */
-		ret = btrfs_read_extent_buffer(tmp, &check);
-		if (ret) {
-			free_extent_buffer(tmp);
-			btrfs_release_path(p);
-			return ret;
-		}
-
-		if (unlock_up)
 			ret = -EAGAIN;
+			goto out;
+		}
 
+		if (!p->skip_locking) {
+			btrfs_unlock_up_safe(p, parent_level + 1);
+			tmp_locked = true;
+			btrfs_tree_read_lock(tmp);
+			btrfs_release_path(p);
+			ret = -EAGAIN;
+			path_released = true;
+		}
+
+		/* Now we're allowed to do a blocking uptodate check. */
+		err = btrfs_read_extent_buffer(tmp, &check);
+		if (err) {
+			ret = err;
+			goto out;
+		}
+
+		if (ret == 0) {
+			ASSERT(!tmp_locked);
+			*eb_ret = tmp;
+			tmp = NULL;
+		}
 		goto out;
 	} else if (p->nowait) {
-		return -EAGAIN;
+		ret = -EAGAIN;
+		goto out;
 	}
 
-	if (unlock_up) {
-		btrfs_unlock_up_safe(p, level + 1);
+	if (!p->skip_locking) {
+		btrfs_unlock_up_safe(p, parent_level + 1);
 		ret = -EAGAIN;
-	} else {
-		ret = 0;
 	}
 
 	if (p->reada != READA_NONE)
-		reada_for_search(fs_info, p, level, slot, key->objectid);
+		reada_for_search(fs_info, p, parent_level, slot, key->objectid);
 
-	tmp = read_tree_block(fs_info, blocknr, &check);
+	tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
 	if (IS_ERR(tmp)) {
-		btrfs_release_path(p);
-		return PTR_ERR(tmp);
+		ret = PTR_ERR(tmp);
+		tmp = NULL;
+		goto out;
 	}
+	read_tmp = true;
+
+	if (!p->skip_locking) {
+		ASSERT(ret == -EAGAIN);
+		tmp_locked = true;
+		btrfs_tree_read_lock(tmp);
+		btrfs_release_path(p);
+		path_released = true;
+	}
+
+	/* Now we're allowed to do a blocking uptodate check. */
+	err = btrfs_read_extent_buffer(tmp, &check);
+	if (err) {
+		ret = err;
+		goto out;
+	}
 
 	/*
 	 * If the read above didn't mark this buffer up to date,
 	 * it will never end up being up to date.  Set ret to EIO now
 	 * and give up so that our caller doesn't loop forever
 	 * on our EAGAINs.
 	 */
-	if (!extent_buffer_uptodate(tmp))
+	if (!extent_buffer_uptodate(tmp)) {
 		ret = -EIO;
+		goto out;
+	}
 
-out:
 	if (ret == 0) {
+		ASSERT(!tmp_locked);
 		*eb_ret = tmp;
-	} else {
-		free_extent_buffer(tmp);
-		btrfs_release_path(p);
+		tmp = NULL;
 	}
+out:
+	if (tmp) {
+		if (tmp_locked)
+			btrfs_tree_read_unlock(tmp);
+		if (read_tmp && ret && ret != -EAGAIN)
+			free_extent_buffer_stale(tmp);
+		else
+			free_extent_buffer(tmp);
+	}
+	if (ret && !path_released)
+		btrfs_release_path(p);
 
 	return ret;
 }
@@ -2197,8 +2233,8 @@ cow_done:
 			goto done;
 		}
 
-		err = read_block_for_search(root, p, &b, level, slot, key);
-		if (err == -EAGAIN)
+		err = read_block_for_search(root, p, &b, slot, key);
+		if (err == -EAGAIN && !p->nowait)
 			goto again;
 		if (err) {
 			ret = err;
@@ -2324,8 +2360,8 @@ again:
 			goto done;
 		}
 
-		err = read_block_for_search(root, p, &b, level, slot, key);
-		if (err == -EAGAIN)
+		err = read_block_for_search(root, p, &b, slot, key);
+		if (err == -EAGAIN && !p->nowait)
 			goto again;
 		if (err) {
 			ret = err;
@@ -2334,7 +2370,7 @@ again:
 
 		level = btrfs_header_level(b);
 		btrfs_tree_read_lock(b);
-		b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
+		b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq);
 		if (!b) {
 			ret = -ENOMEM;
 			goto done;
@@ -4930,8 +4966,7 @@ again:
 		}
 
 		next = c;
-		ret = read_block_for_search(root, path, &next, level,
-					    slot, &key);
+		ret = read_block_for_search(root, path, &next, slot, &key);
 		if (ret == -EAGAIN && !path->nowait)
 			goto again;
 
@@ -4974,8 +5009,7 @@ again:
 		if (!level)
 			break;
 
-		ret = read_block_for_search(root, path, &next, level,
-					    0, &key);
+		ret = read_block_for_search(root, path, &next, 0, &key);
 		if (ret == -EAGAIN && !path->nowait)
 			goto again;
 
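As far as the hunks above show, the rewritten read_block_for_search() now read-locks the child extent buffer before releasing the path that holds the parent, instead of dropping everything and retrying later. A heavily simplified userspace analogy of that hand-over-hand locking order, with hypothetical parent/child nodes and POSIX rwlocks (build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct node {
	pthread_rwlock_t lock;
	int value;
	struct node *child;
};

/*
 * Read the child while holding its lock, taking the child's lock *before*
 * dropping the parent's so no writer can swap the child out in between.
 */
static int read_child_locked(struct node *parent)
{
	int v;

	pthread_rwlock_rdlock(&parent->lock);
	struct node *child = parent->child;

	pthread_rwlock_rdlock(&child->lock);	/* lock child first... */
	pthread_rwlock_unlock(&parent->lock);	/* ...then release parent */

	v = child->value;
	pthread_rwlock_unlock(&child->lock);
	return v;
}

int main(void)
{
	struct node child = { PTHREAD_RWLOCK_INITIALIZER, 42, NULL };
	struct node parent = { PTHREAD_RWLOCK_INITIALIZER, 0, &child };

	printf("child value: %d\n", read_child_locked(&parent));
	return 0;
}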


@@ -64,9 +64,9 @@ struct btrfs_delayed_node {
 	struct mutex mutex;
 	struct btrfs_inode_item inode_item;
 	refcount_t refs;
+	int count;
 	u64 index_cnt;
 	unsigned long flags;
-	int count;
 	/*
 	 * The size of the next batch of dir index items to insert (if this
 	 * node is from a directory inode). Protected by @mutex.
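The hunk above only moves the 32-bit count member so it lands in what used to be padding after refcount_t, shrinking struct btrfs_delayed_node. A standalone sketch of the effect, using simplified stand-in field types (actual sizes depend on the ABI; the numbers below assume a typical 64-bit LP64 target):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins: refcount_t ~ 4 bytes, unsigned long ~ 8 bytes. */
struct with_hole {
	uint32_t refs;
	/* 4 bytes of padding here so index_cnt can be 8-byte aligned */
	uint64_t index_cnt;
	unsigned long flags;
	int count;
	/* plus tail padding to keep the struct 8-byte aligned */
};

struct packed_better {
	uint32_t refs;
	int count;		/* fills the slot that used to be padding */
	uint64_t index_cnt;
	unsigned long flags;
};

int main(void)
{
	printf("with_hole:     %zu bytes\n", sizeof(struct with_hole));
	printf("packed_better: %zu bytes\n", sizeof(struct packed_better));
	return 0;
}

On such a target the first layout typically comes out at 32 bytes and the second at 24, purely from removing the alignment hole and the tail padding.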


@@ -9,6 +9,7 @@
 #include "messages.h"
 #include "ctree.h"
 #include "delayed-ref.h"
+#include "extent-tree.h"
 #include "transaction.h"
 #include "qgroup.h"
 #include "space-info.h"
@@ -313,39 +314,6 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
 	return 0;
 }
 
-/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
-						   struct rb_node *node)
-{
-	struct rb_node **p = &root->rb_root.rb_node;
-	struct rb_node *parent_node = NULL;
-	struct btrfs_delayed_ref_head *entry;
-	struct btrfs_delayed_ref_head *ins;
-	u64 bytenr;
-	bool leftmost = true;
-
-	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
-	bytenr = ins->bytenr;
-	while (*p) {
-		parent_node = *p;
-		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
-				 href_node);
-
-		if (bytenr < entry->bytenr) {
-			p = &(*p)->rb_left;
-		} else if (bytenr > entry->bytenr) {
-			p = &(*p)->rb_right;
-			leftmost = false;
-		} else {
-			return entry;
-		}
-	}
-
-	rb_link_node(node, parent_node, p);
-	rb_insert_color_cached(node, root, leftmost);
-	return NULL;
-}
-
 static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
 		struct btrfs_delayed_ref_node *ins)
 {
@@ -380,75 +348,32 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
 static struct btrfs_delayed_ref_head *find_first_ref_head(
 		struct btrfs_delayed_ref_root *dr)
 {
-	struct rb_node *n;
-	struct btrfs_delayed_ref_head *entry;
+	unsigned long from = 0;
 
-	n = rb_first_cached(&dr->href_root);
-	if (!n)
-		return NULL;
+	lockdep_assert_held(&dr->lock);
 
-	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-
-	return entry;
+	return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT);
 }
 
-/*
- * Find a head entry based on bytenr. This returns the delayed ref head if it
- * was able to find one, or NULL if nothing was in that spot.  If return_bigger
- * is given, the next bigger entry is returned if no exact match is found.
- */
-static struct btrfs_delayed_ref_head *find_ref_head(
-		struct btrfs_delayed_ref_root *dr, u64 bytenr,
-		bool return_bigger)
-{
-	struct rb_root *root = &dr->href_root.rb_root;
-	struct rb_node *n;
-	struct btrfs_delayed_ref_head *entry;
-
-	n = root->rb_node;
-	entry = NULL;
-	while (n) {
-		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-
-		if (bytenr < entry->bytenr)
-			n = n->rb_left;
-		else if (bytenr > entry->bytenr)
-			n = n->rb_right;
-		else
-			return entry;
-	}
-	if (entry && return_bigger) {
-		if (bytenr > entry->bytenr) {
-			n = rb_next(&entry->href_node);
-			if (!n)
-				return NULL;
-			entry = rb_entry(n, struct btrfs_delayed_ref_head,
-					 href_node);
-		}
-		return entry;
-	}
-	return NULL;
-}
-
-int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
-			   struct btrfs_delayed_ref_head *head)
+static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
+				   struct btrfs_delayed_ref_head *head)
 {
 	lockdep_assert_held(&delayed_refs->lock);
 	if (mutex_trylock(&head->mutex))
-		return 0;
+		return true;
 
 	refcount_inc(&head->refs);
 	spin_unlock(&delayed_refs->lock);
 
 	mutex_lock(&head->mutex);
 	spin_lock(&delayed_refs->lock);
-	if (RB_EMPTY_NODE(&head->href_node)) {
+	if (!head->tracked) {
 		mutex_unlock(&head->mutex);
 		btrfs_put_delayed_ref_head(head);
-		return -EAGAIN;
+		return false;
 	}
 	btrfs_put_delayed_ref_head(head);
-	return 0;
+	return true;
 }
@@ -462,7 +387,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
 	if (!list_empty(&ref->add_list))
 		list_del(&ref->add_list);
 	btrfs_put_delayed_ref(ref);
-	atomic_dec(&delayed_refs->num_entries);
 	btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
 }
@@ -558,52 +482,74 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
 }
 
 struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+		const struct btrfs_fs_info *fs_info,
 		struct btrfs_delayed_ref_root *delayed_refs)
 {
 	struct btrfs_delayed_ref_head *head;
+	unsigned long start_index;
+	unsigned long found_index;
+	bool found_head = false;
+	bool locked;
 
-	lockdep_assert_held(&delayed_refs->lock);
+	spin_lock(&delayed_refs->lock);
 again:
-	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
-			     true);
-	if (!head && delayed_refs->run_delayed_start != 0) {
-		delayed_refs->run_delayed_start = 0;
-		head = find_first_ref_head(delayed_refs);
-	}
-	if (!head)
-		return NULL;
-
-	while (head->processing) {
-		struct rb_node *node;
-
-		node = rb_next(&head->href_node);
-		if (!node) {
-			if (delayed_refs->run_delayed_start == 0)
-				return NULL;
-			delayed_refs->run_delayed_start = 0;
-			goto again;
+	start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits);
+	xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) {
+		if (!head->processing) {
+			found_head = true;
+			break;
 		}
-		head = rb_entry(node, struct btrfs_delayed_ref_head,
-				href_node);
+	}
+	if (!found_head) {
+		if (delayed_refs->run_delayed_start == 0) {
+			spin_unlock(&delayed_refs->lock);
+			return NULL;
+		}
+		delayed_refs->run_delayed_start = 0;
+		goto again;
 	}
 
 	head->processing = true;
 	WARN_ON(delayed_refs->num_heads_ready == 0);
 	delayed_refs->num_heads_ready--;
 	delayed_refs->run_delayed_start = head->bytenr +
 		head->num_bytes;
+
+	locked = btrfs_delayed_ref_lock(delayed_refs, head);
+	spin_unlock(&delayed_refs->lock);
+
+	/*
+	 * We may have dropped the spin lock to get the head mutex lock, and
+	 * that might have given someone else time to free the head. If that's
+	 * true, it has been removed from our list and we can move on.
+	 */
+	if (!locked)
+		return ERR_PTR(-EAGAIN);
+
 	return head;
 }
 
-void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+			     struct btrfs_delayed_ref_head *head)
+{
+	spin_lock(&delayed_refs->lock);
+	head->processing = false;
+	delayed_refs->num_heads_ready++;
+	spin_unlock(&delayed_refs->lock);
+	btrfs_delayed_ref_unlock(head);
+}
+
+void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
+			   struct btrfs_delayed_ref_root *delayed_refs,
 			   struct btrfs_delayed_ref_head *head)
 {
+	const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits);
+
 	lockdep_assert_held(&delayed_refs->lock);
 	lockdep_assert_held(&head->lock);
 
-	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
-	RB_CLEAR_NODE(&head->href_node);
-	atomic_dec(&delayed_refs->num_entries);
+	xa_erase(&delayed_refs->head_refs, index);
+	head->tracked = false;
 	delayed_refs->num_heads--;
 	if (!head->processing)
 		delayed_refs->num_heads_ready--;
@@ -629,7 +575,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
 	if (!exist) {
 		if (ref->action == BTRFS_ADD_DELAYED_REF)
 			list_add_tail(&ref->add_list, &href->ref_add_list);
-		atomic_inc(&root->num_entries);
 		spin_unlock(&href->lock);
 		trans->delayed_ref_updates++;
 		return false;
@@ -813,7 +758,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
 	head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
 	head_ref->ref_tree = RB_ROOT_CACHED;
 	INIT_LIST_HEAD(&head_ref->ref_add_list);
-	RB_CLEAR_NODE(&head_ref->href_node);
+	head_ref->tracked = false;
 	head_ref->processing = false;
 	head_ref->total_ref_mod = count_mod;
 	spin_lock_init(&head_ref->lock);
@@ -830,7 +775,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
 		qrecord->data_rsv = reserved;
 		qrecord->data_rsv_refroot = generic_ref->ref_root;
 	}
-	qrecord->bytenr = generic_ref->bytenr;
 	qrecord->num_bytes = generic_ref->num_bytes;
 	qrecord->old_roots = NULL;
 }
@@ -852,19 +796,33 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_delayed_ref_head *existing;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits);
 	bool qrecord_inserted = false;
 
 	delayed_refs = &trans->transaction->delayed_refs;
+	lockdep_assert_held(&delayed_refs->lock);
+
+#if BITS_PER_LONG == 32
+	if (head_ref->bytenr >= MAX_LFS_FILESIZE) {
+		if (qrecord)
+			xa_release(&delayed_refs->dirty_extents, index);
+		btrfs_err_rl(fs_info,
+"delayed ref head %llu is beyond 32bit page cache and xarray index limit",
+			     head_ref->bytenr);
+		btrfs_err_32bit_limit(fs_info);
+		return ERR_PTR(-EOVERFLOW);
+	}
+#endif
 
 	/* Record qgroup extent info if provided */
 	if (qrecord) {
 		int ret;
 
-		ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord);
+		ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord,
+						       head_ref->bytenr);
 		if (ret) {
 			/* Clean up if insertion fails or item exists. */
-			xa_release(&delayed_refs->dirty_extents,
-				   qrecord->bytenr >> fs_info->sectorsize_bits);
+			xa_release(&delayed_refs->dirty_extents, index);
 			/* Caller responsible for freeing qrecord on error. */
 			if (ret < 0)
 				return ERR_PTR(ret);
@@ -876,8 +834,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 
 	trace_add_delayed_ref_head(fs_info, head_ref, action);
 
-	existing = htree_insert(&delayed_refs->href_root,
-				&head_ref->href_node);
+	existing = xa_load(&delayed_refs->head_refs, index);
 	if (existing) {
 		update_existing_head_ref(trans, existing, head_ref);
 		/*
@@ -887,6 +844,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		head_ref = existing;
 	} else {
+		existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC);
+		if (xa_is_err(existing)) {
+			/* Memory was preallocated by the caller. */
+			ASSERT(xa_err(existing) != -ENOMEM);
+			return ERR_PTR(xa_err(existing));
+		} else if (WARN_ON(existing)) {
+			/*
+			 * Shouldn't happen we just did a lookup before under
+			 * delayed_refs->lock.
+			 */
+			return ERR_PTR(-EEXIST);
+		}
+		head_ref->tracked = true;
 		/*
 		 * We reserve the amount of bytes needed to delete csums when
 		 * adding the ref head and not when adding individual drop refs
@@ -900,7 +870,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 		}
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
-		atomic_inc(&delayed_refs->num_entries);
 	}
 	if (qrecord_inserted_ret)
 		*qrecord_inserted_ret = qrecord_inserted;
@@ -1008,6 +977,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *new_head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
+	const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits);
+	bool qrecord_reserved = false;
 	bool qrecord_inserted;
 	int action = generic_ref->action;
 	bool merged;
@@ -1023,25 +994,32 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
 		goto free_node;
 	}
 
+	delayed_refs = &trans->transaction->delayed_refs;
+
 	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
 		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			ret = -ENOMEM;
 			goto free_head_ref;
 		}
-		if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
-			       generic_ref->bytenr >> fs_info->sectorsize_bits,
-			       GFP_NOFS)) {
+		if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
 			ret = -ENOMEM;
 			goto free_record;
 		}
+		qrecord_reserved = true;
+	}
+
+	ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+	if (ret) {
+		if (qrecord_reserved)
+			xa_release(&delayed_refs->dirty_extents, index);
+		goto free_record;
 	}
 
 	init_delayed_ref_common(fs_info, node, generic_ref);
 	init_delayed_ref_head(head_ref, generic_ref, record, reserved);
 	head_ref->extent_op = extent_op;
 
-	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
 	/*
@@ -1051,6 +1029,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
 	new_head_ref = add_delayed_ref_head(trans, head_ref, record,
 					    action, &qrecord_inserted);
 	if (IS_ERR(new_head_ref)) {
+		xa_release(&delayed_refs->head_refs, index);
 		spin_unlock(&delayed_refs->lock);
 		ret = PTR_ERR(new_head_ref);
 		goto free_record;
@@ -1074,7 +1053,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
 	kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
 
 	if (qrecord_inserted)
-		return btrfs_qgroup_trace_extent_post(trans, record);
+		return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr);
 	return 0;
 
 free_record:
@@ -1113,6 +1092,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes, u8 level,
 				struct btrfs_delayed_extent_op *extent_op)
 {
+	const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits);
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_head *head_ref_ret;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -1123,6 +1103,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 		.num_bytes = num_bytes,
 		.tree_ref.level = level,
 	};
+	int ret;
 
 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref)
@@ -1132,16 +1113,23 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	spin_lock(&delayed_refs->lock);
 
+	ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+	if (ret) {
+		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+		return ret;
+	}
+
+	spin_lock(&delayed_refs->lock);
 	head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL,
 					    BTRFS_UPDATE_DELAYED_HEAD, NULL);
-	spin_unlock(&delayed_refs->lock);
-
 	if (IS_ERR(head_ref_ret)) {
+		xa_release(&delayed_refs->head_refs, index);
+		spin_unlock(&delayed_refs->lock);
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		return PTR_ERR(head_ref_ret);
 	}
+	spin_unlock(&delayed_refs->lock);
 
 	/*
 	 * Need to update the delayed_refs_rsv with any changes we may have
@@ -1164,11 +1152,15 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
  * head node if found, or NULL if not.
  */
 struct btrfs_delayed_ref_head *
-btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
+btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
+			    u64 bytenr)
 {
+	const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
+
 	lockdep_assert_held(&delayed_refs->lock);
 
-	return find_ref_head(delayed_refs, bytenr, false);
+	return xa_load(&delayed_refs->head_refs, index);
 }
 
 static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
@@ -1238,6 +1230,81 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
 	return found;
 }
 
+void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
+{
+	struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+
+	spin_lock(&delayed_refs->lock);
+	while (true) {
+		struct btrfs_delayed_ref_head *head;
+		struct rb_node *n;
+		bool pin_bytes = false;
+
+		head = find_first_ref_head(delayed_refs);
+		if (!head)
+			break;
+
+		if (!btrfs_delayed_ref_lock(delayed_refs, head))
+			continue;
+
+		spin_lock(&head->lock);
+		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
+			struct btrfs_delayed_ref_node *ref;
+
+			ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node);
+			drop_delayed_ref(fs_info, delayed_refs, head, ref);
+		}
+		if (head->must_insert_reserved)
+			pin_bytes = true;
+		btrfs_free_delayed_extent_op(head->extent_op);
+		btrfs_delete_ref_head(fs_info, delayed_refs, head);
+		spin_unlock(&head->lock);
+		spin_unlock(&delayed_refs->lock);
+		mutex_unlock(&head->mutex);
+
+		if (pin_bytes) {
+			struct btrfs_block_group *bg;
+
+			bg = btrfs_lookup_block_group(fs_info, head->bytenr);
+			if (WARN_ON_ONCE(bg == NULL)) {
+				/*
+				 * Unexpected and there's nothing we can do here
+				 * because we are in a transaction abort path,
+				 * so any errors can only be ignored or reported
+				 * while attempting to cleanup all resources.
+				 */
+				btrfs_err(fs_info,
+"block group for delayed ref at %llu was not found while destroying ref head",
+					  head->bytenr);
+			} else {
+				spin_lock(&bg->space_info->lock);
+				spin_lock(&bg->lock);
+				bg->pinned += head->num_bytes;
+				btrfs_space_info_update_bytes_pinned(fs_info,
+								     bg->space_info,
+								     head->num_bytes);
+				bg->reserved -= head->num_bytes;
+				bg->space_info->bytes_reserved -= head->num_bytes;
+				spin_unlock(&bg->lock);
+				spin_unlock(&bg->space_info->lock);
+
+				btrfs_put_block_group(bg);
+			}
+
+			btrfs_error_unpin_extent_range(fs_info, head->bytenr,
+				head->bytenr + head->num_bytes - 1);
+		}
+		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+		btrfs_put_delayed_ref_head(head);
+		cond_resched();
+		spin_lock(&delayed_refs->lock);
+	}
+	btrfs_qgroup_destroy_extent_records(trans);
+
+	spin_unlock(&delayed_refs->lock);
+}
+
 void __cold btrfs_delayed_ref_exit(void)
 {
 	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
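Throughout the hunks above, both head_refs and dirty_extents are keyed by the extent's logical address shifted right by fs_info->sectorsize_bits, which keeps the indexes small and dense for the xarray. A tiny standalone sketch of just that index computation (illustrative values only; 4 KiB sectors assumed, so sectorsize_bits is 12):

#include <stdint.h>
#include <stdio.h>

/* With 4 KiB sectors, sectorsize_bits is 12 (1 << 12 == 4096). */
#define SECTORSIZE_BITS 12

static unsigned long head_ref_index(uint64_t bytenr)
{
	/*
	 * Same idea as the kernel change: divide the logical address by the
	 * sector size so consecutive extents map to nearby, small indexes.
	 */
	return (unsigned long)(bytenr >> SECTORSIZE_BITS);
}

int main(void)
{
	uint64_t bytenrs[] = { 1104150528ULL, 1104154624ULL, 1104158720ULL };

	for (int i = 0; i < 3; i++)
		printf("bytenr %llu -> index %lu\n",
		       (unsigned long long)bytenrs[i], head_ref_index(bytenrs[i]));
	return 0;
}

The #if BITS_PER_LONG == 32 check added in add_delayed_ref_head() above exists because xarray indexes are unsigned long, so a 32-bit host cannot represent the full logical address space; such bytenr values are rejected with -EOVERFLOW.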


@@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node {
 	/*
 	 * If action is BTRFS_ADD_DELAYED_REF, also link this node to
 	 * ref_head->ref_add_list, then we do not need to iterate the
-	 * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
+	 * refs rbtree in the corresponding delayed ref head
+	 * (struct btrfs_delayed_ref_head::ref_tree).
 	 */
 	struct list_head add_list;
@@ -122,12 +123,6 @@ struct btrfs_delayed_extent_op {
 struct btrfs_delayed_ref_head {
 	u64 bytenr;
 	u64 num_bytes;
-	/*
-	 * For insertion into struct btrfs_delayed_ref_root::href_root.
-	 * Keep it in the same cache line as 'bytenr' for more efficient
-	 * searches in the rbtree.
-	 */
-	struct rb_node href_node;
 	/*
 	 * the mutex is held while running the refs, and it is also
 	 * held when checking the sum of reference modifications.
@@ -191,6 +186,11 @@ struct btrfs_delayed_ref_head {
 	bool is_data;
 	bool is_system;
 	bool processing;
+	/*
+	 * Indicate if it's currently in the data structure that tracks head
+	 * refs (struct btrfs_delayed_ref_root::head_refs).
+	 */
+	bool tracked;
 };
 
 enum btrfs_delayed_ref_flags {
@@ -199,38 +199,52 @@
 };
 
 struct btrfs_delayed_ref_root {
-	/* head ref rbtree */
-	struct rb_root_cached href_root;
-
 	/*
-	 * Track dirty extent records.
+	 * Track head references.
 	 * The keys correspond to the logical address of the extent ("bytenr")
 	 * right shifted by fs_info->sectorsize_bits. This is both to get a more
 	 * dense index space (optimizes xarray structure) and because indexes in
 	 * xarrays are of "unsigned long" type, meaning they are 32 bits wide on
 	 * 32 bits platforms, limiting the extent range to 4G which is too low
 	 * and makes it unusable (truncated index values) on 32 bits platforms.
+	 * Protected by the spinlock 'lock' defined below.
+	 */
+	struct xarray head_refs;
+
+	/*
+	 * Track dirty extent records.
+	 * The keys correspond to the logical address of the extent ("bytenr")
+	 * right shifted by fs_info->sectorsize_bits, for same reasons as above.
 	 */
 	struct xarray dirty_extents;
 
-	/* this spin lock protects the rbtree and the entries inside */
+	/*
+	 * Protects the xarray head_refs, its entries and the following fields:
+	 * num_heads, num_heads_ready, pending_csums and run_delayed_start.
+	 */
 	spinlock_t lock;
 
-	/* how many delayed ref updates we've queued, used by the
-	 * throttling code
-	 */
-	atomic_t num_entries;
-
-	/* total number of head nodes in tree */
+	/* Total number of head refs, protected by the spinlock 'lock'. */
 	unsigned long num_heads;
 
-	/* total number of head nodes ready for processing */
+	/*
+	 * Total number of head refs ready for processing, protected by the
+	 * spinlock 'lock'.
+	 */
 	unsigned long num_heads_ready;
 
+	/*
+	 * Track space reserved for deleting csums of data extents.
+	 * Protected by the spinlock 'lock'.
+	 */
 	u64 pending_csums;
 
 	unsigned long flags;
 
+	/*
+	 * Track from which bytenr to start searching ref heads.
+	 * Protected by the spinlock 'lock'.
+	 */
 	u64 run_delayed_start;
 
 	/*
@@ -372,19 +386,22 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
 			      struct btrfs_delayed_ref_head *head);
 
 struct btrfs_delayed_ref_head *
-btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
-			   struct btrfs_delayed_ref_head *head);
 static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
 {
 	mutex_unlock(&head->mutex);
 }
 
-void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
+			   struct btrfs_delayed_ref_root *delayed_refs,
 			   struct btrfs_delayed_ref_head *head);
 struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+		const struct btrfs_fs_info *fs_info,
 		struct btrfs_delayed_ref_root *delayed_refs);
+void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+			     struct btrfs_delayed_ref_head *head);
 
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
@@ -399,6 +416,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
 bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
 				 u64 root, u64 parent);
+void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans);
 
 static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
 {


@@ -45,7 +45,7 @@
  *
  * - Copy existing extents
  *
- *   This happens by re-using scrub facility, as scrub also iterates through
+ *   This happens by reusing scrub facility, as scrub also iterates through
  *   existing extents from commit root.
  *
  *   Location: scrub_write_block_to_dev_replace() from
@@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 		return ret;
 
 	down_write(&dev_replace->rwsem);
+	dev_replace->replace_task = current;
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -994,6 +995,7 @@ error:
 	list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
 	fs_devices->rw_devices++;
 
+	dev_replace->replace_task = NULL;
 	up_write(&dev_replace->rwsem);
 	btrfs_rm_dev_replace_blocked(fs_info);
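The two small hunks above record which task is running the replace and clear it when the operation ends, presumably so other code paths can recognize work issued by the replace task itself. A minimal userspace analogy of that bookkeeping, using pthreads and hypothetical names (build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical long-running operation descriptor. */
struct replace_ctx {
	pthread_mutex_t lock;
	pthread_t task;		/* thread currently running the operation */
	bool task_valid;
};

static struct replace_ctx ctx = { PTHREAD_MUTEX_INITIALIZER };

static bool current_is_replace_task(void)
{
	bool ret;

	pthread_mutex_lock(&ctx.lock);
	ret = ctx.task_valid && pthread_equal(ctx.task, pthread_self());
	pthread_mutex_unlock(&ctx.lock);
	return ret;
}

static void *replace_worker(void *arg)
{
	pthread_mutex_lock(&ctx.lock);
	ctx.task = pthread_self();	/* like dev_replace->replace_task = current */
	ctx.task_valid = true;
	pthread_mutex_unlock(&ctx.lock);

	printf("worker sees itself as replace task: %d\n", current_is_replace_task());

	pthread_mutex_lock(&ctx.lock);
	ctx.task_valid = false;		/* like setting it back to NULL */
	pthread_mutex_unlock(&ctx.lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, replace_worker, NULL);
	pthread_join(t, NULL);
	printf("main sees itself as replace task: %d\n", current_is_replace_task());
	return 0;
}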


@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 						   const char *name,
 						   int name_len)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	char *ptr;
 	struct extent_buffer *leaf;
@@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
 	if (ret == -EEXIST) {
 		struct btrfs_dir_item *di;
-		di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+		di = btrfs_match_dir_item_name(path, name, name_len);
 		if (di)
 			return ERR_PTR(-EEXIST);
 		btrfs_extend_item(trans, path, data_size);
@@ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
 	if (ret > 0)
 		return ERR_PTR(-ENOENT);
 
-	return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+	return btrfs_match_dir_item_name(path, name, name_len);
 }
 
 /*
@@ -341,8 +340,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
 		if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
 			break;
 
-		di = btrfs_match_dir_item_name(root->fs_info, path,
-					       name->name, name->len);
+		di = btrfs_match_dir_item_name(path, name->name, name->len);
 		if (di)
 			return di;
 	}
@@ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
-						 const struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path,
 						 const char *name, int name_len)
 {
 	struct btrfs_dir_item *dir_item;


@@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  struct btrfs_path *path, u64 dir,
 					  const char *name, u16 name_len,
 					  int mod);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
-						 const struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path,
 						 const char *name,
 						 int name_len);


@@ -834,7 +834,7 @@ relock:
 		return ret;
 	}
 
-	ret = btrfs_write_check(iocb, from, ret);
+	ret = btrfs_write_check(iocb, ret);
 	if (ret < 0) {
 		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
 		goto out;


@@ -917,8 +917,7 @@ fail:
 	return ERR_PTR(ret);
 }
 
-static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
-					 struct btrfs_fs_info *fs_info)
+static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root;
@@ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *log_root;
 
-	log_root = alloc_log_tree(trans, fs_info);
+	log_root = alloc_log_tree(fs_info);
 	if (IS_ERR(log_root))
 		return PTR_ERR(log_root);
@@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_inode_item *inode_item;
 	int ret;
 
-	log_root = alloc_log_tree(trans, fs_info);
+	log_root = alloc_log_tree(fs_info);
 	if (IS_ERR(log_root))
 		return PTR_ERR(log_root);
@@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	btrfs_init_scrub(fs_info);
 	btrfs_init_balance(fs_info);
 	btrfs_init_async_reclaim_work(fs_info);
+	btrfs_init_extent_map_shrinker_work(fs_info);
 
 	rwlock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
 	if (ret)
 		return ret;
 
-	spin_lock_init(&fs_info->extent_map_shrinker_lock);
-
 	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
 	if (ret)
 		return ret;
@@ -3202,8 +3200,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
 	return 0;
 }
 
-int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
-		      const char *options)
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
 {
 	u32 sectorsize;
 	u32 nodesize;
@@ -4186,7 +4183,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
 		btrfs_warn(fs_info,
 	"transaction %llu (with %llu dirty metadata bytes) is not committed",
 			   trans->transid, dirty_bytes);
-		btrfs_cleanup_one_transaction(trans, fs_info);
+		btrfs_cleanup_one_transaction(trans);
 
 		if (trans == fs_info->running_transaction)
 			fs_info->running_transaction = NULL;
@@ -4294,6 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	cancel_work_sync(&fs_info->async_reclaim_work);
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
 	cancel_work_sync(&fs_info->preempt_reclaim_work);
+	cancel_work_sync(&fs_info->em_shrinker_work);
 
 	/* Cancel or finish ongoing discard work */
 	btrfs_discard_cleanup(fs_info);
@@ -4531,75 +4529,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
 	btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
 }
 
-static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
-				       struct btrfs_fs_info *fs_info)
-{
-	struct rb_node *node;
-	struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
-	struct btrfs_delayed_ref_node *ref;
-
-	spin_lock(&delayed_refs->lock);
-	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
-		struct btrfs_delayed_ref_head *head;
-		struct rb_node *n;
-		bool pin_bytes = false;
-
-		head = rb_entry(node, struct btrfs_delayed_ref_head,
-				href_node);
-		if (btrfs_delayed_ref_lock(delayed_refs, head))
-			continue;
-
-		spin_lock(&head->lock);
-		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
-			ref = rb_entry(n, struct btrfs_delayed_ref_node,
-				       ref_node);
-			rb_erase_cached(&ref->ref_node, &head->ref_tree);
-			RB_CLEAR_NODE(&ref->ref_node);
-			if (!list_empty(&ref->add_list))
-				list_del(&ref->add_list);
-			atomic_dec(&delayed_refs->num_entries);
-			btrfs_put_delayed_ref(ref);
-			btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
-		}
-		if (head->must_insert_reserved)
-			pin_bytes = true;
-		btrfs_free_delayed_extent_op(head->extent_op);
-		btrfs_delete_ref_head(delayed_refs, head);
-		spin_unlock(&head->lock);
-		spin_unlock(&delayed_refs->lock);
-		mutex_unlock(&head->mutex);
-
-		if (pin_bytes) {
-			struct btrfs_block_group *cache;
-
-			cache = btrfs_lookup_block_group(fs_info, head->bytenr);
-			BUG_ON(!cache);
-
-			spin_lock(&cache->space_info->lock);
-			spin_lock(&cache->lock);
-			cache->pinned += head->num_bytes;
-			btrfs_space_info_update_bytes_pinned(fs_info,
-				cache->space_info, head->num_bytes);
-			cache->reserved -= head->num_bytes;
-			cache->space_info->bytes_reserved -= head->num_bytes;
-			spin_unlock(&cache->lock);
-			spin_unlock(&cache->space_info->lock);
-
-			btrfs_error_unpin_extent_range(fs_info, head->bytenr,
-				head->bytenr + head->num_bytes - 1);
-		}
-		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
-		btrfs_put_delayed_ref_head(head);
-		cond_resched();
-		spin_lock(&delayed_refs->lock);
-	}
-	btrfs_qgroup_destroy_extent_records(trans);
-
-	spin_unlock(&delayed_refs->lock);
-}
-
 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 {
 	struct btrfs_inode *btrfs_inode;
@@ -4805,9 +4734,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 }
 
-void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
-				   struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
 {
+	struct btrfs_fs_info *fs_info = cur_trans->fs_info;
 	struct btrfs_device *dev, *tmp;
 
 	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
@@ -4819,7 +4748,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 		list_del_init(&dev->post_commit_list);
 	}
 
-	btrfs_destroy_delayed_refs(cur_trans, fs_info);
+	btrfs_destroy_delayed_refs(cur_trans);
 
 	cur_trans->state = TRANS_STATE_COMMIT_START;
 	wake_up(&fs_info->transaction_blocked_wait);
@@ -4865,7 +4794,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
 		} else {
 			spin_unlock(&fs_info->trans_lock);
 		}
-		btrfs_cleanup_one_transaction(t, fs_info);
+		btrfs_cleanup_one_transaction(t);
 
 		spin_lock(&fs_info->trans_lock);
 		if (t == fs_info->running_transaction)


@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block(
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb); const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
const char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info); void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(const struct btrfs_fs_info *fs_info, int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num); const struct btrfs_super_block *sb, int mirror_num);
@ -127,8 +126,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root); struct btrfs_root *root);
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info); struct btrfs_fs_info *fs_info);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans);
struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid); u64 objectid);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);


@ -182,7 +182,7 @@ search_again:
delayed_refs = &trans->transaction->delayed_refs; delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock); spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
if (head) { if (head) {
if (!mutex_trylock(&head->mutex)) { if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs); refcount_inc(&head->refs);
@ -795,7 +795,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
if (insert) { if (insert) {
extra_size = btrfs_extent_inline_ref_size(want); extra_size = btrfs_extent_inline_ref_size(want);
path->search_for_extension = 1; path->search_for_extension = 1;
path->keep_locks = 1;
} else } else
extra_size = -1; extra_size = -1;
@ -946,6 +945,25 @@ again:
ret = -EAGAIN; ret = -EAGAIN;
goto out; goto out;
} }
if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) {
struct btrfs_key tmp_key;
btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1);
if (tmp_key.objectid == bytenr &&
tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
ret = -EAGAIN;
goto out;
}
goto out_no_entry;
}
if (!path->keep_locks) {
btrfs_release_path(path);
path->keep_locks = 1;
goto again;
}
/* /*
* To add new inline back ref, we have to make sure * To add new inline back ref, we have to make sure
* there is no corresponding back ref item. * there is no corresponding back ref item.
@ -959,13 +977,15 @@ again:
goto out; goto out;
} }
} }
out_no_entry:
*ref_ret = (struct btrfs_extent_inline_ref *)ptr; *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out: out:
if (insert) { if (path->keep_locks) {
path->keep_locks = 0; path->keep_locks = 0;
path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1); btrfs_unlock_up_safe(path, 1);
} }
if (insert)
path->search_for_extension = 0;
return ret; return ret;
} }
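
The hunk above stops setting path->keep_locks up front for inserts in lookup_inline_extent_backref(); the first search runs without it, and only when an insert still has to be prepared does the code release the path, set keep_locks and search again. As a rough, hedged illustration of that "cheap first pass, retry with stronger locking only if a modification is needed" pattern (not btrfs code), here is a minimal userspace C sketch using a toy table and a pthread rwlock in place of the btrfs path locks:

/*
 * Illustrative userspace sketch only. The table, helpers and rwlock are
 * invented for the demo; they merely mirror the retry shape of the hunk.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static int item_table[8] = { 1, 3, 5, 7, 9, 11, 13, 15 };

/* Return true if key is present; *slot is where it is (or would go). */
static bool find_slot(int key, int *slot)
{
    for (int i = 0; i < 8; i++) {
        if (item_table[i] >= key) {
            *slot = i;
            return item_table[i] == key;
        }
    }
    *slot = 8;
    return false;
}

static void lookup_or_insert(int key)
{
    bool write_locked = false;
    bool found;
    int slot;

again:
    if (write_locked)
        pthread_rwlock_wrlock(&table_lock);    /* the "keep_locks" pass */
    else
        pthread_rwlock_rdlock(&table_lock);    /* cheap optimistic pass */

    found = find_slot(key, &slot);
    if (!found && !write_locked) {
        /*
         * We would have to modify the table: drop the cheap lock and
         * redo the search under the stronger one, like the keep_locks
         * retry in the hunk above.
         */
        pthread_rwlock_unlock(&table_lock);
        write_locked = true;
        goto again;
    }

    if (found)
        printf("key %d found at slot %d\n", key, slot);
    else
        printf("key %d would be inserted at slot %d\n", key, slot);
    pthread_rwlock_unlock(&table_lock);
}

int main(void)
{
    lookup_or_insert(5);    /* hit: the read-locked pass is enough */
    lookup_or_insert(6);    /* miss: falls back to the write-locked pass */
    return 0;
}
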
@ -1807,16 +1827,6 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return ref; return ref;
} }
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
spin_lock(&delayed_refs->lock);
head->processing = false;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
btrfs_delayed_ref_unlock(head);
}
static struct btrfs_delayed_extent_op *cleanup_extent_op( static struct btrfs_delayed_extent_op *cleanup_extent_op(
struct btrfs_delayed_ref_head *head) struct btrfs_delayed_ref_head *head)
{ {
@ -1891,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
ret = run_and_cleanup_extent_op(trans, head); ret = run_and_cleanup_extent_op(trans, head);
if (ret < 0) { if (ret < 0) {
unselect_delayed_ref_head(delayed_refs, head); btrfs_unselect_ref_head(delayed_refs, head);
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
return ret; return ret;
} else if (ret) { } else if (ret) {
@ -1910,7 +1920,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock); spin_unlock(&delayed_refs->lock);
return 1; return 1;
} }
btrfs_delete_ref_head(delayed_refs, head); btrfs_delete_ref_head(fs_info, delayed_refs, head);
spin_unlock(&head->lock); spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock); spin_unlock(&delayed_refs->lock);
@ -1933,39 +1943,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
return ret; return ret;
} }
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
struct btrfs_trans_handle *trans)
{
struct btrfs_delayed_ref_root *delayed_refs =
&trans->transaction->delayed_refs;
struct btrfs_delayed_ref_head *head = NULL;
int ret;
spin_lock(&delayed_refs->lock);
head = btrfs_select_ref_head(delayed_refs);
if (!head) {
spin_unlock(&delayed_refs->lock);
return head;
}
/*
* Grab the lock that says we are going to process all the refs for
* this head
*/
ret = btrfs_delayed_ref_lock(delayed_refs, head);
spin_unlock(&delayed_refs->lock);
/*
* We may have dropped the spin lock to get the head mutex lock, and
* that might have given someone else time to free the head. If that's
* true, it has been removed from our list and we can move on.
*/
if (ret == -EAGAIN)
head = ERR_PTR(-EAGAIN);
return head;
}
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref, struct btrfs_delayed_ref_head *locked_ref,
u64 *bytes_released) u64 *bytes_released)
@ -1986,7 +1963,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
if (ref->seq && if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) { btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock); spin_unlock(&locked_ref->lock);
unselect_delayed_ref_head(delayed_refs, locked_ref); btrfs_unselect_ref_head(delayed_refs, locked_ref);
return -EAGAIN; return -EAGAIN;
} }
@ -2009,7 +1986,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
default: default:
WARN_ON(1); WARN_ON(1);
} }
atomic_dec(&delayed_refs->num_entries);
/* /*
* Record the must_insert_reserved flag before we drop the * Record the must_insert_reserved flag before we drop the
@ -2035,7 +2011,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
btrfs_free_delayed_extent_op(extent_op); btrfs_free_delayed_extent_op(extent_op);
if (ret) { if (ret) {
unselect_delayed_ref_head(delayed_refs, locked_ref); btrfs_unselect_ref_head(delayed_refs, locked_ref);
btrfs_put_delayed_ref(ref); btrfs_put_delayed_ref(ref);
return ret; return ret;
} }
@ -2073,7 +2049,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
do { do {
if (!locked_ref) { if (!locked_ref) {
locked_ref = btrfs_obtain_ref_head(trans); locked_ref = btrfs_select_ref_head(fs_info, delayed_refs);
if (IS_ERR_OR_NULL(locked_ref)) { if (IS_ERR_OR_NULL(locked_ref)) {
if (PTR_ERR(locked_ref) == -EAGAIN) { if (PTR_ERR(locked_ref) == -EAGAIN) {
continue; continue;
@ -2220,7 +2196,7 @@ again:
btrfs_create_pending_block_groups(trans); btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock); spin_lock(&delayed_refs->lock);
if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { if (xa_empty(&delayed_refs->head_refs)) {
spin_unlock(&delayed_refs->lock); spin_unlock(&delayed_refs->lock);
return 0; return 0;
} }
@ -2275,7 +2251,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
delayed_refs = &cur_trans->delayed_refs; delayed_refs = &cur_trans->delayed_refs;
spin_lock(&delayed_refs->lock); spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
if (!head) { if (!head) {
spin_unlock(&delayed_refs->lock); spin_unlock(&delayed_refs->lock);
btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans);
@ -3144,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
break; break;
} }
/* Quick path didn't find the EXTEMT/METADATA_ITEM */ /* Quick path didn't find the EXTENT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5) if (path->slots[0] - extent_slot > 5)
break; break;
extent_slot--; extent_slot--;
@ -3377,13 +3353,14 @@ out:
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
u64 bytenr) u64 bytenr)
{ {
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_root *delayed_refs;
int ret = 0; int ret = 0;
delayed_refs = &trans->transaction->delayed_refs; delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock); spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
if (!head) if (!head)
goto out_delayed_unlock; goto out_delayed_unlock;
@ -3401,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (!mutex_trylock(&head->mutex)) if (!mutex_trylock(&head->mutex))
goto out; goto out;
btrfs_delete_ref_head(delayed_refs, head); btrfs_delete_ref_head(fs_info, delayed_refs, head);
head->processing = false; head->processing = false;
spin_unlock(&head->lock); spin_unlock(&head->lock);
@ -3411,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved) if (head->must_insert_reserved)
ret = 1; ret = 1;
btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head); btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
mutex_unlock(&head->mutex); mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head); btrfs_put_delayed_ref_head(head);
return ret; return ret;
@ -5270,7 +5247,7 @@ struct walk_control {
* corrupted file systems must have been caught before calling this function. * corrupted file systems must have been caught before calling this function.
*/ */
static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
struct extent_buffer *eb, u64 refs, u64 flags, int slot) struct extent_buffer *eb, u64 flags, int slot)
{ {
struct btrfs_key key; struct btrfs_key key;
u64 generation; u64 generation;
@ -5384,7 +5361,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue; continue;
/* If we don't need to visit this node don't reada. */ /* If we don't need to visit this node don't reada. */
if (!visit_node_for_delete(root, wc, eb, refs, flags, slot)) if (!visit_node_for_delete(root, wc, eb, flags, slot))
continue; continue;
reada: reada:
btrfs_readahead_node_child(eb, slot); btrfs_readahead_node_child(eb, slot);
@ -5518,7 +5495,7 @@ again:
*/ */
delayed_refs = &trans->transaction->delayed_refs; delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock); spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
if (!head) if (!head)
goto out; goto out;
if (!mutex_trylock(&head->mutex)) { if (!mutex_trylock(&head->mutex)) {
@ -5737,8 +5714,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
/* If we don't have to walk into this node skip it. */ /* If we don't have to walk into this node skip it. */
if (!visit_node_for_delete(root, wc, path->nodes[level], if (!visit_node_for_delete(root, wc, path->nodes[level],
wc->refs[level - 1], wc->flags[level - 1], wc->flags[level - 1], path->slots[level]))
path->slots[level]))
goto skip; goto skip;
/* /*


@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info,
btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len); btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
if (folio != locked_folio && (page_ops & PAGE_UNLOCK)) if (folio != locked_folio && (page_ops & PAGE_UNLOCK))
btrfs_folio_end_writer_lock(fs_info, folio, start, len); btrfs_folio_end_lock(fs_info, folio, start, len);
} }
static void __process_folios_contig(struct address_space *mapping, static void __process_folios_contig(struct address_space *mapping,
@ -276,7 +276,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
range_start = max_t(u64, folio_pos(folio), start); range_start = max_t(u64, folio_pos(folio), start);
range_len = min_t(u64, folio_pos(folio) + folio_size(folio), range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
end + 1) - range_start; end + 1) - range_start;
btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len); btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1; processed_end = range_start + range_len - 1;
} }
@ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
if (!btrfs_is_subpage(fs_info, folio->mapping)) if (!btrfs_is_subpage(fs_info, folio->mapping))
folio_unlock(folio); folio_unlock(folio);
else else
btrfs_subpage_end_reader(fs_info, folio, start, len); btrfs_folio_end_lock(fs_info, folio, start, len);
} }
/* /*
@ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
return; return;
ASSERT(folio_test_private(folio)); ASSERT(folio_test_private(folio));
btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE); btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
} }
/* /*
@ -1102,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
return ret; return ret;
} }
static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap,
u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
const u64 folio_start = folio_pos(folio);
unsigned int start_bit;
unsigned int nbits;
ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
nbits = len >> fs_info->sectorsize_bits;
ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
bitmap_set(delalloc_bitmap, start_bit, nbits);
}
static bool find_next_delalloc_bitmap(struct folio *folio,
unsigned long *delalloc_bitmap, u64 start,
u64 *found_start, u32 *found_len)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
const u64 folio_start = folio_pos(folio);
const unsigned int bitmap_size = fs_info->sectors_per_page;
unsigned int start_bit;
unsigned int first_zero;
unsigned int first_set;
ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
if (first_set >= bitmap_size)
return false;
*found_start = folio_start + (first_set << fs_info->sectorsize_bits);
first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set);
*found_len = (first_zero - first_set) << fs_info->sectorsize_bits;
return true;
}
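
The two helpers added above keep a per-folio bitmap with one bit per sector: set_delalloc_bitmap() marks each delalloc range that was found, and find_next_delalloc_bitmap() later walks the bitmap to recover contiguous ranges. A minimal userspace C sketch of the same bookkeeping, assuming a 64K page with 4K sectors; the sizes and the open-coded bit loops are demo stand-ins for the kernel's bitmap_set()/find_next_bit() helpers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SIZE      4096u
#define PAGE_SIZE_DEMO   65536u                      /* 16 sectors per page */
#define SECTORS_PER_PAGE (PAGE_SIZE_DEMO / SECTOR_SIZE)

/* Mark the sectors covered by [start, start + len) as delalloc. */
static void set_range(uint32_t *bitmap, uint64_t start, uint32_t len)
{
    uint32_t first = start / SECTOR_SIZE;
    uint32_t nbits = len / SECTOR_SIZE;

    for (uint32_t i = 0; i < nbits; i++)
        *bitmap |= 1u << (first + i);
}

/* Find the next contiguous set run at or after @start; false if none. */
static bool find_next_range(uint32_t bitmap, uint64_t start,
                            uint64_t *found_start, uint32_t *found_len)
{
    uint32_t bit = start / SECTOR_SIZE;

    while (bit < SECTORS_PER_PAGE && !(bitmap & (1u << bit)))
        bit++;
    if (bit >= SECTORS_PER_PAGE)
        return false;

    uint32_t end = bit;
    while (end < SECTORS_PER_PAGE && (bitmap & (1u << end)))
        end++;

    *found_start = (uint64_t)bit * SECTOR_SIZE;
    *found_len = (end - bit) * SECTOR_SIZE;
    return true;
}

int main(void)
{
    uint32_t bitmap = 0;
    uint64_t start, cur = 0;
    uint32_t len;

    set_range(&bitmap, 8192, 12288);     /* sectors 2..4 */
    set_range(&bitmap, 40960, 4096);     /* sector 10 */

    while (find_next_range(bitmap, cur, &start, &len)) {
        printf("delalloc range: [%llu, %llu)\n",
               (unsigned long long)start,
               (unsigned long long)(start + len));
        cur = start + len;
    }
    return 0;
}
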
/* /*
* helper for extent_writepage(), doing all of the delayed allocation setup. * helper for extent_writepage(), doing all of the delayed allocation setup.
* *
@ -1121,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
const u64 page_start = folio_pos(folio); const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1; const u64 page_end = page_start + folio_size(folio) - 1;
unsigned long delalloc_bitmap = 0;
/* /*
* Save the last found delalloc end. As the delalloc end can go beyond * Save the last found delalloc end. As the delalloc end can go beyond
* page boundary, thus we cannot rely on subpage bitmap to locate the * page boundary, thus we cannot rely on subpage bitmap to locate the
@ -1131,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
u64 delalloc_end = page_end; u64 delalloc_end = page_end;
u64 delalloc_to_write = 0; u64 delalloc_to_write = 0;
int ret = 0; int ret = 0;
int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */ /* Save the dirty bitmap as our submission bitmap will be a subset of it. */
if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
@ -1140,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
bio_ctrl->submit_bitmap = 1; bio_ctrl->submit_bitmap = 1;
} }
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
u64 start = page_start + (bit << fs_info->sectorsize_bits);
btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
}
/* Lock all (subpage) delalloc ranges inside the folio first. */ /* Lock all (subpage) delalloc ranges inside the folio first. */
while (delalloc_start < page_end) { while (delalloc_start < page_end) {
delalloc_end = page_end; delalloc_end = page_end;
@ -1148,9 +1195,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = delalloc_end + 1; delalloc_start = delalloc_end + 1;
continue; continue;
} }
btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start, set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
min(delalloc_end, page_end) + 1 - min(delalloc_end, page_end) + 1 - delalloc_start);
delalloc_start);
last_delalloc_end = delalloc_end; last_delalloc_end = delalloc_end;
delalloc_start = delalloc_end + 1; delalloc_start = delalloc_end + 1;
} }
@ -1175,7 +1221,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start; found_len = last_delalloc_end + 1 - found_start;
found = true; found = true;
} else { } else {
found = btrfs_subpage_find_writer_locked(fs_info, folio, found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
delalloc_start, &found_start, &found_len); delalloc_start, &found_start, &found_len);
} }
if (!found) if (!found)
@ -1314,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
* a folio for a range already written to disk. * a folio for a range already written to disk.
*/ */
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1); btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
/* /*
* Above call should set the whole folio with writeback flag, even * Above call should set the whole folio with writeback flag, even
* just for a single subpage sector. * just for a single subpage sector.
@ -1391,8 +1437,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
goto out; goto out;
submitted_io = true; submitted_io = true;
} }
btrfs_folio_assert_not_dirty(fs_info, folio, start, len);
out: out:
/* /*
* If we didn't submitted any sector (>= i_size), folio dirty get * If we didn't submitted any sector (>= i_size), folio dirty get
@ -1476,7 +1520,7 @@ done:
* Only unlock ranges that are submitted. As there can be some async * Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio. * submitted ranges inside the folio.
*/ */
btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap); btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
ASSERT(ret <= 0); ASSERT(ret <= 0);
return ret; return ret;
} }
@ -2115,7 +2159,27 @@ retry:
continue; continue;
} }
if (wbc->sync_mode != WB_SYNC_NONE) { /*
* For subpage case, compression can lead to mixed
* writeback and dirty flags, e.g:
* 0 32K 64K 96K 128K
* | |//////||/////| |//|
*
* In above case, [32K, 96K) is asynchronously submitted
* for compression, and [124K, 128K) needs to be written back.
*
* If we didn't wait for writeback on page 64K, [124K, 128K)
* won't be submitted as the page still has writeback flag
* and will be skipped in the next check.
*
* This mixed writeback and dirty case is only possible for
* subpage case.
*
* TODO: Remove this check after migrating compression to
* regular submission.
*/
if (wbc->sync_mode != WB_SYNC_NONE ||
btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
if (folio_test_writeback(folio)) if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0); submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio); folio_wait_writeback(folio);
@ -2200,7 +2264,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
u32 cur_len = cur_end + 1 - cur; u32 cur_len = cur_end + 1 - cur;
struct folio *folio; struct folio *folio;
folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0); folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
/* /*
* This shouldn't happen, the pages are pinned and locked, this * This shouldn't happen, the pages are pinned and locked, this
@ -2233,7 +2297,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
cur, cur_len, !ret); cur, cur_len, !ret);
mapping_set_error(mapping, ret); mapping_set_error(mapping, ret);
} }
btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len); btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0) if (ret < 0)
found_error = true; found_error = true;
next_page: next_page:
@ -2317,7 +2381,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* to drop the page. * to drop the page.
*/ */
static bool try_release_extent_state(struct extent_io_tree *tree, static bool try_release_extent_state(struct extent_io_tree *tree,
struct folio *folio, gfp_t mask) struct folio *folio)
{ {
u64 start = folio_pos(folio); u64 start = folio_pos(folio);
u64 end = start + PAGE_SIZE - 1; u64 end = start + PAGE_SIZE - 1;
@ -2428,7 +2492,7 @@ next:
cond_resched(); cond_resched();
} }
} }
return try_release_extent_state(io_tree, folio, mask); return try_release_extent_state(io_tree, folio);
} }
static void __free_extent_buffer(struct extent_buffer *eb) static void __free_extent_buffer(struct extent_buffer *eb)
@ -2442,7 +2506,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
} }
static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) static bool folio_range_has_eb(struct folio *folio)
{ {
struct btrfs_subpage *subpage; struct btrfs_subpage *subpage;
@ -2452,12 +2516,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli
subpage = folio_get_private(folio); subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs)) if (atomic_read(&subpage->eb_refs))
return true; return true;
/*
* Even there is no eb refs here, we may still have
* end_folio_read() call relying on page::private.
*/
if (atomic_read(&subpage->readers))
return true;
} }
return false; return false;
} }
@ -2516,7 +2574,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* We can only detach the folio private if there are no other ebs in the * We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO. * page range and no unfinished IO.
*/ */
if (!folio_range_has_eb(fs_info, folio)) if (!folio_range_has_eb(folio))
btrfs_detach_subpage(fs_info, folio); btrfs_detach_subpage(fs_info, folio);
spin_unlock(&folio->mapping->i_private_lock); spin_unlock(&folio->mapping->i_private_lock);
@ -3121,7 +3179,7 @@ out:
} }
/* /*
* Now all pages of that extent buffer is unmapped, set UNMAPPED flag, * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
* so it can be cleaned up without utlizing page->mapping. * so it can be cleaned up without utilizing page->mapping.
*/ */
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@ -4221,7 +4279,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level) u64 bytenr, u64 owner_root, u64 gen, int level)
{ {
struct btrfs_tree_parent_check check = { struct btrfs_tree_parent_check check = {
.has_first_key = 0,
.level = level, .level = level,
.transid = gen .transid = gen
}; };


@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len)
return start + len; return start + len;
} }
static void dec_evictable_extent_maps(struct btrfs_inode *inode) static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
{ {
struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_fs_info *fs_info = inode->root->fs_info;
rb_erase(&em->rb_node, &inode->extent_tree.root);
RB_CLEAR_NODE(&em->rb_node);
if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
percpu_counter_dec(&fs_info->evictable_extent_maps); percpu_counter_dec(&fs_info->evictable_extent_maps);
} }
@ -339,7 +342,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{ {
struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL; struct extent_map *merge = NULL;
struct rb_node *rb; struct rb_node *rb;
@ -371,10 +373,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
em->flags |= EXTENT_FLAG_MERGED; em->flags |= EXTENT_FLAG_MERGED;
validate_extent_map(fs_info, em); validate_extent_map(fs_info, em);
rb_erase(&merge->rb_node, &tree->root); remove_em(inode, merge);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge); free_extent_map(merge);
dec_evictable_extent_maps(inode);
} }
} }
@ -386,12 +386,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
merge_ondisk_extents(em, merge, em); merge_ondisk_extents(em, merge, em);
validate_extent_map(fs_info, em); validate_extent_map(fs_info, em);
rb_erase(&merge->rb_node, &tree->root);
RB_CLEAR_NODE(&merge->rb_node);
em->generation = max(em->generation, merge->generation); em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED; em->flags |= EXTENT_FLAG_MERGED;
remove_em(inode, merge);
free_extent_map(merge); free_extent_map(merge);
dec_evictable_extent_maps(inode);
} }
} }
@ -588,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
lockdep_assert_held_write(&tree->lock); lockdep_assert_held_write(&tree->lock);
WARN_ON(em->flags & EXTENT_FLAG_PINNED); WARN_ON(em->flags & EXTENT_FLAG_PINNED);
rb_erase(&em->rb_node, &tree->root);
if (!(em->flags & EXTENT_FLAG_LOGGING)) if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list); list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
dec_evictable_extent_maps(inode); remove_em(inode, em);
} }
static void replace_extent_mapping(struct btrfs_inode *inode, static void replace_extent_mapping(struct btrfs_inode *inode,
@ -1122,13 +1118,12 @@ out_free_pre:
struct btrfs_em_shrink_ctx { struct btrfs_em_shrink_ctx {
long nr_to_scan; long nr_to_scan;
long scanned; long scanned;
u64 last_ino;
u64 last_root;
}; };
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx) static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{ {
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); struct btrfs_fs_info *fs_info = inode->root->fs_info;
const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
struct extent_map_tree *tree = &inode->extent_tree; struct extent_map_tree *tree = &inode->extent_tree;
long nr_dropped = 0; long nr_dropped = 0;
struct rb_node *node; struct rb_node *node;
@ -1201,7 +1196,8 @@ next:
* lock. This is to avoid slowing other tasks trying to take the * lock. This is to avoid slowing other tasks trying to take the
* lock. * lock.
*/ */
if (need_resched() || rwlock_needbreak(&tree->lock)) if (need_resched() || rwlock_needbreak(&tree->lock) ||
btrfs_fs_closing(fs_info))
break; break;
node = next; node = next;
} }
@ -1213,19 +1209,21 @@ next:
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{ {
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode; struct btrfs_inode *inode;
long nr_dropped = 0; long nr_dropped = 0;
u64 min_ino = ctx->last_ino + 1; u64 min_ino = fs_info->em_shrinker_last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino); inode = btrfs_find_first_inode(root, min_ino);
while (inode) { while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx); nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1; min_ino = btrfs_ino(inode) + 1;
ctx->last_ino = btrfs_ino(inode); fs_info->em_shrinker_last_ino = btrfs_ino(inode);
btrfs_add_delayed_iput(inode); btrfs_add_delayed_iput(inode);
if (ctx->scanned >= ctx->nr_to_scan) if (ctx->scanned >= ctx->nr_to_scan ||
btrfs_fs_closing(inode->root->fs_info))
break; break;
cond_resched(); cond_resched();
@ -1241,52 +1239,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
* inode if there is one or we will find out this was the last * inode if there is one or we will find out this was the last
* one and move to the next root. * one and move to the next root.
*/ */
ctx->last_root = btrfs_root_id(root); fs_info->em_shrinker_last_root = btrfs_root_id(root);
} else { } else {
/* /*
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode. * that when processing the next root we start from its first inode.
*/ */
ctx->last_ino = 0; fs_info->em_shrinker_last_ino = 0;
ctx->last_root = btrfs_root_id(root) + 1; fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1;
} }
return nr_dropped; return nr_dropped;
} }
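
btrfs_scan_root() above resumes from a persistent (root, inode) cursor kept in fs_info->em_shrinker_last_root and em_shrinker_last_ino, and resets the inode part to zero when it moves on to the next root. A small userspace C sketch of that resumable two-level cursor, with toy root/inode counts and a printf standing in for the real extent map scan (every name below is invented for the demo):

#include <stdint.h>
#include <stdio.h>

#define NR_ROOTS        3
#define INODES_PER_ROOT 5

/* Persistent cursor, analogous to the two fs_info fields above. */
static uint64_t last_root;
static uint64_t last_ino;

/* Scan at most @budget "inodes", resuming from the saved cursor. */
static void shrink_pass(int budget)
{
    int scanned = 0;

    while (scanned < budget) {
        if (last_root >= NR_ROOTS) {
            /* Wrapped past the last root: start over on the next run. */
            last_root = 0;
            last_ino = 0;
            break;
        }
        if (last_ino >= INODES_PER_ROOT) {
            /* Done with this root: move to its successor and restart
             * at that root's first inode. */
            last_root++;
            last_ino = 0;
            continue;
        }
        printf("scanning root %llu inode %llu\n",
               (unsigned long long)last_root,
               (unsigned long long)last_ino);
        last_ino++;
        scanned++;
    }
}

int main(void)
{
    shrink_pass(4);    /* first run covers root 0, inodes 0-3 */
    shrink_pass(4);    /* second run resumes at root 0, inode 4 */
    return 0;
}
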
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{ {
struct btrfs_fs_info *fs_info;
struct btrfs_em_shrink_ctx ctx; struct btrfs_em_shrink_ctx ctx;
u64 start_root_id; u64 start_root_id;
u64 next_root_id; u64 next_root_id;
bool cycled = false; bool cycled = false;
long nr_dropped = 0; long nr_dropped = 0;
fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work);
ctx.scanned = 0; ctx.scanned = 0;
ctx.nr_to_scan = nr_to_scan; ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan);
/* start_root_id = fs_info->em_shrinker_last_root;
* In case we have multiple tasks running this shrinker, make the next next_root_id = fs_info->em_shrinker_last_root;
* one start from the next inode in case it starts before we finish.
*/
spin_lock(&fs_info->extent_map_shrinker_lock);
ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
fs_info->extent_map_shrinker_last_ino++;
ctx.last_root = fs_info->extent_map_shrinker_last_root;
spin_unlock(&fs_info->extent_map_shrinker_lock);
start_root_id = ctx.last_root;
next_root_id = ctx.last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr);
nr, ctx.last_root,
ctx.last_ino);
} }
while (ctx.scanned < ctx.nr_to_scan) { while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
struct btrfs_root *root; struct btrfs_root *root;
unsigned long count; unsigned long count;
@ -1300,8 +1289,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock); spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) { if (start_root_id > 0 && !cycled) {
next_root_id = 0; next_root_id = 0;
ctx.last_root = 0; fs_info->em_shrinker_last_root = 0;
ctx.last_ino = 0; fs_info->em_shrinker_last_ino = 0;
cycled = true; cycled = true;
continue; continue;
} }
@ -1320,29 +1309,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
btrfs_put_root(root); btrfs_put_root(root);
} }
/*
* In case of multiple tasks running this extent map shrinking code this
* isn't perfect but it's simple and silences things like KCSAN. It's
* not possible to know which task made more progress because we can
* cycle back to the first root and first inode if it's not the first
* time the shrinker ran, see the above logic. Also a task that started
* later may finish ealier than another task and made less progress. So
* make this simple and update to the progress of the last task that
* finished, with the occasional possiblity of having two consecutive
* runs of the shrinker process the same inodes.
*/
spin_lock(&fs_info->extent_map_shrinker_lock);
fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
fs_info->extent_map_shrinker_last_root = ctx.last_root;
spin_unlock(&fs_info->extent_map_shrinker_lock);
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
nr, ctx.last_root,
ctx.last_ino);
} }
return nr_dropped; atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
}
void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
/*
* Do nothing if the shrinker is already running. In case of high memory
* pressure we can have a lot of tasks calling us and all passing the
* same nr_to_scan value, but in reality we may need only to free
* nr_to_scan extent maps (or less). In case we need to free more than
* that, we will be called again by the fs shrinker, so no worries about
* not doing enough work to reclaim memory from extent maps.
* We can also be repeatedly called with the same nr_to_scan value
* simply because the shrinker runs asynchronously and multiple calls
* to this function are made before the shrinker does enough progress.
*
* That's why we set the atomic counter to nr_to_scan only if its
* current value is zero, instead of incrementing the counter by
* nr_to_scan.
*/
if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
return;
queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
}
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
{
atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker);
} }
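
The comment in btrfs_free_extent_maps() above explains the gating: only the caller that flips em_shrinker_nr_to_scan from zero to nr_to_scan queues the worker, and the worker sets the counter back to zero when it finishes, so a burst of identical requests collapses into a single run. A minimal userspace C sketch of that pattern, using C11 atomics and a detached pthread as a stand-in for the kernel workqueue (names invented for the demo):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_long nr_to_scan;

static void *shrinker_worker(void *arg)
{
    (void)arg;
    long nr = atomic_load(&nr_to_scan);

    printf("worker: scanning up to %ld entries\n", nr);
    sleep(1);                          /* pretend to do the work */
    atomic_store(&nr_to_scan, 0);      /* re-arm the gate */
    return NULL;
}

static void request_shrink(long nr)
{
    long expected = 0;

    /* Only the caller that flips 0 -> nr queues the worker. */
    if (!atomic_compare_exchange_strong(&nr_to_scan, &expected, nr)) {
        printf("request for %ld ignored, worker already pending\n", nr);
        return;
    }

    pthread_t tid;
    pthread_create(&tid, NULL, shrinker_worker, NULL);
    pthread_detach(tid);
}

int main(void)
{
    request_shrink(128);    /* queues the worker */
    request_shrink(128);    /* ignored: gate already armed */
    sleep(2);               /* let the worker finish and clear the gate */
    request_shrink(64);     /* queues again */
    sleep(2);
    return 0;
}
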


@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em, struct extent_map *new_em,
bool modified); bool modified);
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
#endif #endif


@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* we have in the cache is the last delalloc range we * we have in the cache is the last delalloc range we
* found while the file extent item we found can be * found while the file extent item we found can be
* either for a whole delalloc range we previously * either for a whole delalloc range we previously
* emmitted or only a part of that range. * emitted or only a part of that range.
* *
* We have two cases here: * We have two cases here:
* *
@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* cached extent's end. In this case just ignore the * cached extent's end. In this case just ignore the
* current file extent item because we don't want to * current file extent item because we don't want to
* overlap with previous ranges that may have been * overlap with previous ranges that may have been
* emmitted already; * emitted already;
* *
* 2) The file extent item starts behind the currently * 2) The file extent item starts behind the currently
* cached extent but its end offset goes beyond the * cached extent but its end offset goes beyond the
* end offset of the cached extent. We don't want to * end offset of the cached extent. We don't want to
* overlap with a previous range that may have been * overlap with a previous range that may have been
* emmitted already, so we emit the currently cached * emitted already, so we emit the currently cached
* extent and then partially store the current file * extent and then partially store the current file
* extent item's range in the cache, for the subrange * extent item's range in the cache, for the subrange
* going the cached extent's end to the end of the * going the cached extent's end to the end of the


@ -37,33 +37,30 @@
#include "file.h" #include "file.h"
#include "super.h" #include "super.h"
/* simple helper to fault in pages and copy. This should go away /*
* and be replaced with calls into generic code. * Helper to fault in page and copy. This should go away and be replaced with
* calls into generic code.
*/ */
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
struct page **prepared_pages, struct folio *folio, struct iov_iter *i)
struct iov_iter *i)
{ {
size_t copied = 0; size_t copied = 0;
size_t total_copied = 0; size_t total_copied = 0;
int pg = 0;
int offset = offset_in_page(pos); int offset = offset_in_page(pos);
while (write_bytes > 0) { while (write_bytes > 0) {
size_t count = min_t(size_t, size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
PAGE_SIZE - offset, write_bytes);
struct page *page = prepared_pages[pg];
/* /*
* Copy data from userspace to the current page * Copy data from userspace to the current page
*/ */
copied = copy_page_from_iter_atomic(page, offset, count, i); copied = copy_folio_from_iter_atomic(folio, offset, count, i);
/* Flush processor's dcache for this page */ /* Flush processor's dcache for this page */
flush_dcache_page(page); flush_dcache_folio(folio);
/* /*
* if we get a partial write, we can end up with * if we get a partial write, we can end up with
* partially up to date pages. These add * partially up to date page. These add
* a lot of complexity, so make sure they don't * a lot of complexity, so make sure they don't
* happen by forcing this copy to be retried. * happen by forcing this copy to be retried.
* *
@ -71,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
* back to page at a time copies after we return 0. * back to page at a time copies after we return 0.
*/ */
if (unlikely(copied < count)) { if (unlikely(copied < count)) {
if (!PageUptodate(page)) { if (!folio_test_uptodate(folio)) {
iov_iter_revert(i, copied); iov_iter_revert(i, copied);
copied = 0; copied = 0;
} }
@ -82,54 +79,44 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
write_bytes -= copied; write_bytes -= copied;
total_copied += copied; total_copied += copied;
offset += copied; offset += copied;
if (offset == PAGE_SIZE) {
pg++;
offset = 0;
}
} }
return total_copied; return total_copied;
} }
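
With btrfs_copy_from_user() reduced to a single folio, the buffered write loop now caps each pass at min(iov_iter_count(i), PAGE_SIZE - offset), so a write that crosses page boundaries is simply split over successive iterations. A small userspace C sketch of that chunking arithmetic, with a demo page size and hard-coded write parameters (nothing below is btrfs API):

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096u

static uint32_t offset_in_page(uint64_t pos)
{
    return (uint32_t)(pos & (DEMO_PAGE_SIZE - 1));
}

int main(void)
{
    uint64_t pos = 4096ull - 100;  /* write starts 100 bytes before a page boundary */
    uint64_t remaining = 10000;    /* total bytes the caller wants to write */
    int pass = 0;

    while (remaining > 0) {
        uint32_t offset = offset_in_page(pos);
        uint64_t chunk = DEMO_PAGE_SIZE - offset;

        if (chunk > remaining)
            chunk = remaining;

        printf("pass %d: page %llu, offset %u, copy %llu bytes\n",
               ++pass, (unsigned long long)(pos / DEMO_PAGE_SIZE),
               (unsigned)offset, (unsigned long long)chunk);

        pos += chunk;
        remaining -= chunk;
    }
    return 0;
}
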
/* /*
* unlocks pages after btrfs_file_write is done with them * Unlock folio after btrfs_file_write() is done with it.
*/ */
static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
struct page **pages, size_t num_pages,
u64 pos, u64 copied) u64 pos, u64 copied)
{ {
size_t i;
u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_start = round_down(pos, fs_info->sectorsize);
u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
ASSERT(block_len <= U32_MAX); ASSERT(block_len <= U32_MAX);
for (i = 0; i < num_pages; i++) { /*
/* page checked is some magic around finding pages that * Folio checked is some magic around finding folios that have been
* have been modified without going through btrfs_set_page_dirty * modified without going through btrfs_dirty_folio(). Clear it here.
* clear it here. There should be no need to mark the pages * There should be no need to mark the pages accessed as
* accessed as prepare_pages should have marked them accessed * prepare_one_folio() should have marked them accessed in
* in prepare_pages via find_or_create_page() * prepare_one_folio() via find_or_create_page()
*/ */
btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
block_start, block_len); folio_unlock(folio);
unlock_page(pages[i]); folio_put(folio);
put_page(pages[i]);
}
} }
/* /*
* After btrfs_copy_from_user(), update the following things for delalloc: * After btrfs_copy_from_user(), update the following things for delalloc:
* - Mark newly dirtied pages as DELALLOC in the io tree. * - Mark newly dirtied folio as DELALLOC in the io tree.
* Used to advise which range is to be written back. * Used to advise which range is to be written back.
* - Mark modified pages as Uptodate/Dirty and not needing COW fixup * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
* - Update inode size for past EOF write * - Update inode size for past EOF write
*/ */
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
size_t num_pages, loff_t pos, size_t write_bytes, size_t write_bytes, struct extent_state **cached, bool noreserve)
struct extent_state **cached, bool noreserve)
{ {
struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0; int ret = 0;
int i;
u64 num_bytes; u64 num_bytes;
u64 start_pos; u64 start_pos;
u64 end_of_last_block; u64 end_of_last_block;
@ -147,6 +134,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
num_bytes = round_up(write_bytes + pos - start_pos, num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize); fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX); ASSERT(num_bytes <= U32_MAX);
ASSERT(folio_pos(folio) <= pos &&
folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
end_of_last_block = start_pos + num_bytes - 1; end_of_last_block = start_pos + num_bytes - 1;
@ -163,16 +152,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
if (ret) if (ret)
return ret; return ret;
for (i = 0; i < num_pages; i++) { btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
struct page *p = pages[i]; btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
start_pos, num_bytes);
btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
start_pos, num_bytes);
btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
start_pos, num_bytes);
}
/* /*
* we've only changed i_size in ram, and we haven't updated * we've only changed i_size in ram, and we haven't updated
@ -851,55 +833,49 @@ out:
} }
/* /*
* on error we return an unlocked page and the error value * On error return an unlocked folio and the error value
* on success we return a locked page and 0 * On success return a locked folio and 0
*/ */
static int prepare_uptodate_page(struct inode *inode, static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
struct page *page, u64 pos, u64 len, bool force_uptodate)
bool force_uptodate)
{ {
struct folio *folio = page_folio(page); u64 clamp_start = max_t(u64, pos, folio_pos(folio));
u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
int ret = 0; int ret = 0;
if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && if (folio_test_uptodate(folio))
!PageUptodate(page)) { return 0;
if (!force_uptodate &&
IS_ALIGNED(clamp_start, PAGE_SIZE) &&
IS_ALIGNED(clamp_end, PAGE_SIZE))
return 0;
ret = btrfs_read_folio(NULL, folio); ret = btrfs_read_folio(NULL, folio);
if (ret) if (ret)
return ret; return ret;
lock_page(page); folio_lock(folio);
if (!PageUptodate(page)) { if (!folio_test_uptodate(folio)) {
unlock_page(page); folio_unlock(folio);
return -EIO; return -EIO;
} }
/* /*
* Since btrfs_read_folio() will unlock the folio before it * Since btrfs_read_folio() will unlock the folio before it returns,
* returns, there is a window where btrfs_release_folio() can be * there is a window where btrfs_release_folio() can be called to
* called to release the page. Here we check both inode * release the page. Here we check both inode mapping and page
* mapping and PagePrivate() to make sure the page was not * private to make sure the page was not released.
* released.
* *
* The private flag check is essential for subpage as we need * The private flag check is essential for subpage as we need to store
* to store extra bitmap using folio private. * extra bitmap using folio private.
*/ */
if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
unlock_page(page); folio_unlock(folio);
return -EAGAIN; return -EAGAIN;
} }
}
return 0; return 0;
} }
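
prepare_uptodate_folio() above only issues a read when part of the folio would survive the write: an already uptodate folio is accepted as is, and if the clamped range is page aligned on both ends and the caller did not force uptodate, the read-modify-write is skipped. A minimal userspace C sketch of that decision, with a demo page size and an invented helper name:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096ull

static bool need_read_before_write(uint64_t folio_pos, uint64_t folio_size,
                                   uint64_t pos, uint64_t len, bool force_uptodate)
{
    uint64_t clamp_start = pos > folio_pos ? pos : folio_pos;
    uint64_t clamp_end = pos + len < folio_pos + folio_size ?
                         pos + len : folio_pos + folio_size;

    if (force_uptodate)
        return true;
    /* Fully overwritten pages do not need their old contents read in. */
    return (clamp_start % DEMO_PAGE_SIZE) != 0 ||
           (clamp_end % DEMO_PAGE_SIZE) != 0;
}

int main(void)
{
    /* Write of a whole page: no read needed, prints 0. */
    printf("%d\n", need_read_before_write(4096, DEMO_PAGE_SIZE, 4096, 4096, false));
    /* Write of the first half of a page: must read the rest first, prints 1. */
    printf("%d\n", need_read_before_write(4096, DEMO_PAGE_SIZE, 4096, 2048, false));
    return 0;
}
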
static fgf_t get_prepare_fgp_flags(bool nowait)
{
fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
if (nowait)
fgp_flags |= FGP_NOWAIT;
return fgp_flags;
}
static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{ {
gfp_t gfp; gfp_t gfp;
@ -914,89 +890,67 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
} }
/* /*
* this just gets pages into the page cache and locks them down. * Get folio into the page cache and lock it.
*/ */
static noinline int prepare_pages(struct inode *inode, struct page **pages, static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
size_t num_pages, loff_t pos, loff_t pos, size_t write_bytes,
size_t write_bytes, bool force_uptodate, bool force_uptodate, bool nowait)
bool nowait)
{ {
int i;
unsigned long index = pos >> PAGE_SHIFT; unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait); gfp_t mask = get_prepare_gfp_flags(inode, nowait);
fgf_t fgp_flags = get_prepare_fgp_flags(nowait); fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
struct folio *folio;
int ret = 0; int ret = 0;
int faili;
for (i = 0; i < num_pages; i++) {
again: again:
pages[i] = pagecache_get_page(inode->i_mapping, index + i, folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
fgp_flags, mask | __GFP_WRITE); if (IS_ERR(folio)) {
if (!pages[i]) {
faili = i - 1;
if (nowait) if (nowait)
ret = -EAGAIN; ret = -EAGAIN;
else else
ret = -ENOMEM; ret = PTR_ERR(folio);
goto fail; return ret;
} }
/* Only support page sized folio yet. */
ret = set_page_extent_mapped(pages[i]); ASSERT(folio_order(folio) == 0);
ret = set_folio_extent_mapped(folio);
if (ret < 0) { if (ret < 0) {
faili = i; folio_unlock(folio);
goto fail; folio_put(folio);
return ret;
} }
ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
if (i == 0)
ret = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
if (!ret && i == num_pages - 1)
ret = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
if (ret) { if (ret) {
put_page(pages[i]); /* The folio is already unlocked. */
folio_put(folio);
if (!nowait && ret == -EAGAIN) { if (!nowait && ret == -EAGAIN) {
ret = 0; ret = 0;
goto again; goto again;
} }
faili = i - 1;
goto fail;
}
wait_on_page_writeback(pages[i]);
}
return 0;
fail:
while (faili >= 0) {
unlock_page(pages[faili]);
put_page(pages[faili]);
faili--;
}
return ret; return ret;
}
*folio_ret = folio;
return 0;
} }
/* /*
* This function locks the extent and properly waits for data=ordered extents * Locks the extent and properly waits for data=ordered extents to finish
* to finish before allowing the pages to be modified if need. * before allowing the folios to be modified if need.
* *
* The return value: * Return:
* 1 - the extent is locked * 1 - the extent is locked
* 0 - the extent is not locked, and everything is OK * 0 - the extent is not locked, and everything is OK
* -EAGAIN - need re-prepare the pages * -EAGAIN - need to prepare the folios again
* the other < 0 number - Something wrong happens
*/ */
static noinline int static noinline int
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
size_t num_pages, loff_t pos, loff_t pos, size_t write_bytes,
size_t write_bytes,
u64 *lockstart, u64 *lockend, bool nowait, u64 *lockstart, u64 *lockend, bool nowait,
struct extent_state **cached_state) struct extent_state **cached_state)
{ {
struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start_pos; u64 start_pos;
u64 last_pos; u64 last_pos;
int i;
int ret = 0; int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize); start_pos = round_down(pos, fs_info->sectorsize);
@ -1008,12 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
if (nowait) { if (nowait) {
if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
cached_state)) { cached_state)) {
for (i = 0; i < num_pages; i++) { folio_unlock(folio);
unlock_page(pages[i]); folio_put(folio);
put_page(pages[i]);
pages[i] = NULL;
}
return -EAGAIN; return -EAGAIN;
} }
} else { } else {
@ -1027,10 +977,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
ordered->file_offset <= last_pos) { ordered->file_offset <= last_pos) {
unlock_extent(&inode->io_tree, start_pos, last_pos, unlock_extent(&inode->io_tree, start_pos, last_pos,
cached_state); cached_state);
for (i = 0; i < num_pages; i++) { folio_unlock(folio);
unlock_page(pages[i]); folio_put(folio);
put_page(pages[i]);
}
btrfs_start_ordered_extent(ordered); btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered);
return -EAGAIN; return -EAGAIN;
@ -1044,11 +992,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
} }
/* /*
* We should be called after prepare_pages() which should have locked * We should be called after prepare_one_folio() which should have locked
* all pages in the range. * all pages in the range.
*/ */
for (i = 0; i < num_pages; i++) WARN_ON(!folio_test_locked(folio));
WARN_ON(!PageLocked(pages[i]));
return ret; return ret;
} }
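
The rewritten comment above gives the caller contract of lock_and_cleanup_extent_if_need(): 1 means the extent range is locked, 0 means no locking was needed, and -EAGAIN means the folio has to be prepared again. A rough userspace C sketch of the retry loop a caller is expected to implement, with a fake stub in place of the real function:

#include <errno.h>
#include <stdio.h>

/* Fake stand-in: pretend the first attempt races and needs a retry. */
static int fake_lock_extent_if_need(int attempt)
{
    return attempt == 0 ? -EAGAIN : 1;
}

int main(void)
{
    int attempt = 0;
    int extents_locked;

    for (;;) {
        /* A real caller would (re)prepare and lock the folio here. */
        extents_locked = fake_lock_extent_if_need(attempt++);
        if (extents_locked == -EAGAIN) {
            printf("attempt %d: raced, preparing the folio again\n", attempt);
            continue;
        }
        break;
    }

    printf("attempt %d: extent %slocked, copying data\n", attempt,
           extents_locked == 1 ? "" : "not ");
    /* ... copy from the iterator, then unlock if extents_locked == 1 ... */
    return 0;
}
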
@ -1120,7 +1067,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock); btrfs_drew_write_unlock(&inode->root->snapshot_lock);
} }
int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) int btrfs_write_check(struct kiocb *iocb, size_t count)
{ {
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
@ -1175,20 +1122,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
loff_t pos; loff_t pos;
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL; struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0; u64 release_bytes = 0;
u64 lockstart; u64 lockstart;
u64 lockend; u64 lockend;
size_t num_written = 0; size_t num_written = 0;
int nrptrs;
ssize_t ret; ssize_t ret;
bool only_release_metadata = false;
bool force_page_uptodate = false;
loff_t old_isize = i_size_read(inode); loff_t old_isize = i_size_read(inode);
unsigned int ilock_flags = 0; unsigned int ilock_flags = 0;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
bool only_release_metadata = false;
if (nowait) if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY; ilock_flags |= BTRFS_ILOCK_TRY;
@ -1201,38 +1145,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
if (ret <= 0) if (ret <= 0)
goto out; goto out;
ret = btrfs_write_check(iocb, i, ret); ret = btrfs_write_check(iocb, ret);
if (ret < 0) if (ret < 0)
goto out; goto out;
pos = iocb->ki_pos; pos = iocb->ki_pos;
nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
PAGE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
goto out;
}
while (iov_iter_count(i) > 0) { while (iov_iter_count(i) > 0) {
struct extent_state *cached_state = NULL; struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos); size_t offset = offset_in_page(pos);
size_t sector_offset; size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i), size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
nrptrs * (size_t)PAGE_SIZE -
offset);
size_t num_pages;
size_t reserve_bytes; size_t reserve_bytes;
size_t dirty_pages;
size_t copied; size_t copied;
size_t dirty_sectors; size_t dirty_sectors;
size_t num_sectors; size_t num_sectors;
struct folio *folio = NULL;
int extents_locked; int extents_locked;
bool force_page_uptodate = false;
/* /*
* Fault pages before locking them in prepare_pages * Fault pages before locking them in prepare_one_folio()
* to avoid recursive lock * to avoid recursive lock
*/ */
if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
@ -1271,8 +1203,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
only_release_metadata = true; only_release_metadata = true;
} }
num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
WARN_ON(num_pages > nrptrs);
reserve_bytes = round_up(write_bytes + sector_offset, reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize); fs_info->sectorsize);
WARN_ON(reserve_bytes == 0); WARN_ON(reserve_bytes == 0);
@ -1300,22 +1230,16 @@ again:
break; break;
} }
/* ret = prepare_one_folio(inode, &folio, pos, write_bytes,
* This is going to setup the pages array with the number of force_page_uptodate, false);
* pages we want, so we don't really need to worry about the
* contents of pages from loop to loop
*/
ret = prepare_pages(inode, pages, num_pages,
pos, write_bytes, force_page_uptodate, false);
if (ret) { if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode), btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes); reserve_bytes);
break; break;
} }
extents_locked = lock_and_cleanup_extent_if_need( extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
BTRFS_I(inode), pages, folio, pos, write_bytes, &lockstart,
num_pages, pos, write_bytes, &lockstart,
&lockend, nowait, &cached_state); &lockend, nowait, &cached_state);
if (extents_locked < 0) { if (extents_locked < 0) {
if (!nowait && extents_locked == -EAGAIN) if (!nowait && extents_locked == -EAGAIN)
@ -1327,28 +1251,18 @@ again:
break; break;
} }
copied = btrfs_copy_from_user(pos, write_bytes, pages, i); copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
dirty_sectors = round_up(copied + sector_offset, dirty_sectors = round_up(copied + sector_offset,
fs_info->sectorsize); fs_info->sectorsize);
dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
/*
* if we have trouble faulting in the pages, fall
* back to one page at a time
*/
if (copied < write_bytes)
nrptrs = 1;
if (copied == 0) { if (copied == 0) {
force_page_uptodate = true; force_page_uptodate = true;
dirty_sectors = 0; dirty_sectors = 0;
dirty_pages = 0;
} else { } else {
force_page_uptodate = false; force_page_uptodate = false;
dirty_pages = DIV_ROUND_UP(copied + offset,
PAGE_SIZE);
} }
if (num_sectors > dirty_sectors) { if (num_sectors > dirty_sectors) {
@ -1358,13 +1272,10 @@ again:
btrfs_delalloc_release_metadata(BTRFS_I(inode), btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true); release_bytes, true);
} else { } else {
u64 __pos; u64 release_start = round_up(pos + copied,
fs_info->sectorsize);
__pos = round_down(pos,
fs_info->sectorsize) +
(dirty_pages << PAGE_SHIFT);
btrfs_delalloc_release_space(BTRFS_I(inode), btrfs_delalloc_release_space(BTRFS_I(inode),
data_reserved, __pos, data_reserved, release_start,
release_bytes, true); release_bytes, true);
} }
} }
@ -1372,15 +1283,14 @@ again:
release_bytes = round_up(copied + sector_offset, release_bytes = round_up(copied + sector_offset,
fs_info->sectorsize); fs_info->sectorsize);
ret = btrfs_dirty_pages(BTRFS_I(inode), pages, ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
dirty_pages, pos, copied,
&cached_state, only_release_metadata); &cached_state, only_release_metadata);
/* /*
* If we have not locked the extent range, because the range's * If we have not locked the extent range, because the range's
* start offset is >= i_size, we might still have a non-NULL * start offset is >= i_size, we might still have a non-NULL
* cached extent state, acquired while marking the extent range * cached extent state, acquired while marking the extent range
* as delalloc through btrfs_dirty_pages(). Therefore free any * as delalloc through btrfs_dirty_folio(). Therefore free any
* possible cached extent state to avoid a memory leak. * possible cached extent state to avoid a memory leak.
*/ */
if (extents_locked) if (extents_locked)
@ -1391,7 +1301,7 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) { if (ret) {
btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); btrfs_drop_folio(fs_info, folio, pos, copied);
break; break;
} }
@ -1399,7 +1309,7 @@ again:
if (only_release_metadata) if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode)); btrfs_check_nocow_unlock(BTRFS_I(inode));
btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); btrfs_drop_folio(fs_info, folio, pos, copied);
cond_resched(); cond_resched();
@ -1407,8 +1317,6 @@ again:
num_written += copied; num_written += copied;
} }
kfree(pages);
if (release_bytes) { if (release_bytes) {
if (only_release_metadata) { if (only_release_metadata) {
btrfs_check_nocow_unlock(BTRFS_I(inode)); btrfs_check_nocow_unlock(BTRFS_I(inode));
@ -1453,7 +1361,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (ret || encoded->len == 0) if (ret || encoded->len == 0)
goto out; goto out;
ret = btrfs_write_check(iocb, from, encoded->len); ret = btrfs_write_check(iocb, encoded->len);
if (ret < 0) if (ret < 0)
goto out; goto out;
@ -3785,6 +3693,7 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl, .compat_ioctl = btrfs_compat_ioctl,
#endif #endif
.remap_file_range = btrfs_remap_file_range, .remap_file_range = btrfs_remap_file_range,
.uring_cmd = btrfs_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
}; };


@ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded); const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
size_t num_pages, loff_t pos, size_t write_bytes, size_t write_bytes, struct extent_state **cached, bool noreserve);
struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end); int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait); size_t *write_bytes, bool nowait);
@ -44,7 +43,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state, struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret); u64 *delalloc_start_ret, u64 *delalloc_end_ret);
int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count); int btrfs_write_check(struct kiocb *iocb, size_t count);
ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i); ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i);
#endif #endif


@ -11,6 +11,7 @@
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include <linux/error-injection.h> #include <linux/error-injection.h>
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
#include <linux/string_choices.h>
#include "ctree.h" #include "ctree.h"
#include "fs.h" #include "fs.h"
#include "messages.h" #include "messages.h"
@ -1387,6 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
int bitmaps = 0; int bitmaps = 0;
int ret; int ret;
int must_iput = 0; int must_iput = 0;
int i_size;
if (!i_size_read(inode)) if (!i_size_read(inode))
return -EIO; return -EIO;
@ -1457,11 +1459,16 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl); io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */ /* Everything is written out, now we dirty the pages in the file. */
ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, i_size = i_size_read(inode);
io_ctl->num_pages, 0, i_size_read(inode), for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) {
&cached_state, false); u64 dirty_start = i * PAGE_SIZE;
if (ret) u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start;
ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]),
dirty_start, dirty_len, &cached_state, false);
if (ret < 0)
goto out_nospc; goto out_nospc;
}
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem); up_write(&block_group->data_rwsem);
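The hunk above replaces the single btrfs_dirty_pages() call covering [0, i_size) with a per-folio loop that clamps the last folio's dirty length to i_size. A minimal userspace sketch of that clamping follows; the 4 KiB page size and the 10000-byte cache size are made-up example values, not taken from the patch:

/* Standalone sketch of the per-folio clamping done above. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long round_up_ul(unsigned long x, unsigned long align)
{
	return (x + align - 1) / align * align;
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long i_size = 10000;	/* hypothetical free space cache size */

	for (unsigned long i = 0; i < round_up_ul(i_size, PAGE_SIZE) / PAGE_SIZE; i++) {
		unsigned long dirty_start = i * PAGE_SIZE;
		unsigned long dirty_len = min_ul(dirty_start + PAGE_SIZE, i_size) - dirty_start;

		/* Prints 0/4096, 4096/4096, 8192/1808 for these example values. */
		printf("folio %lu: start %lu, len %lu\n", i, dirty_start, dirty_len);
	}
	return 0;
}

Clamping per folio keeps bytes past i_size from being marked dirty now that each folio is dirtied individually instead of in one bulk call.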
@ -2936,12 +2943,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
if (info->bytes >= bytes && !block_group->ro) if (info->bytes >= bytes && !block_group->ro)
count++; count++;
btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
info->offset, info->bytes, info->offset, info->bytes, str_yes_no(info->bitmap));
(info->bitmap) ? "yes" : "no");
} }
spin_unlock(&ctl->tree_lock); spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s", btrfs_info(fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes"); str_no_yes(list_empty(&block_group->cluster_list)));
btrfs_info(fs_info, btrfs_info(fs_info,
"%d free space entries at or bigger than %llu bytes", "%d free space entries at or bigger than %llu bytes",
count, bytes); count, bytes);


@ -263,10 +263,10 @@ enum {
BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_ZONED | \
BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
#ifdef CONFIG_BTRFS_DEBUG #ifdef CONFIG_BTRFS_EXPERIMENTAL
/* /*
* Features under development like Extent tree v2 support are enabled * Features under development like Extent tree v2 support are enabled
* only under CONFIG_BTRFS_DEBUG. * only under CONFIG_BTRFS_EXPERIMENTAL.
*/ */
#define BTRFS_FEATURE_INCOMPAT_SUPP \ #define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
@ -317,6 +317,8 @@ struct btrfs_dev_replace {
struct percpu_counter bio_counter; struct percpu_counter bio_counter;
wait_queue_head_t replace_wait; wait_queue_head_t replace_wait;
struct task_struct *replace_task;
}; };
/* /*
@ -633,9 +635,10 @@ struct btrfs_fs_info {
s32 delalloc_batch; s32 delalloc_batch;
struct percpu_counter evictable_extent_maps; struct percpu_counter evictable_extent_maps;
spinlock_t extent_map_shrinker_lock; u64 em_shrinker_last_root;
u64 extent_map_shrinker_last_root; u64 em_shrinker_last_ino;
u64 extent_map_shrinker_last_ino; atomic64_t em_shrinker_nr_to_scan;
struct work_struct em_shrinker_work;
/* Protected by 'trans_lock'. */ /* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots; struct list_head dirty_cowonly_roots;
@ -876,12 +879,9 @@ struct btrfs_fs_info {
#endif #endif
}; };
#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \
struct page *: (_page))->mapping->host))
#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \
struct folio *: (_folio))->mapping->host)) struct folio *: (_folio))->mapping->host))
#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info)
#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info)
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \


@ -421,7 +421,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
index++; index++;
continue; continue;
} }
folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0); folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
index++; index++;
if (IS_ERR(folio)) if (IS_ERR(folio))
continue; continue;
@ -556,8 +556,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
} else { } else {
struct folio *folio; struct folio *folio;
folio = __filemap_get_folio(inode->vfs_inode.i_mapping, folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
0, 0, 0);
ASSERT(!IS_ERR(folio)); ASSERT(!IS_ERR(folio));
btrfs_set_file_extent_compression(leaf, ei, 0); btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_local_folio(folio, 0); kaddr = kmap_local_folio(folio, 0);
@ -646,7 +645,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
* If being used directly, you must have already checked we're allowed to cow * If being used directly, you must have already checked we're allowed to cow
* the range by getting true from can_cow_file_range_inline(). * the range by getting true from can_cow_file_range_inline().
*/ */
static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
u64 size, size_t compressed_size, u64 size, size_t compressed_size,
int compress_type, int compress_type,
struct folio *compressed_folio, struct folio *compressed_folio,
@ -736,7 +735,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
return 1; return 1;
lock_extent(&inode->io_tree, offset, end, &cached); lock_extent(&inode->io_tree, offset, end, &cached);
ret = __cow_file_range_inline(inode, offset, size, compressed_size, ret = __cow_file_range_inline(inode, size, compressed_size,
compress_type, compressed_folio, compress_type, compressed_folio,
update_i_size); update_i_size);
if (ret > 0) { if (ret > 0) {
@ -832,32 +831,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
return 0; return 0;
} }
/* /*
* Special check for subpage. * Only enable sector perfect compression for experimental builds.
* *
* We lock the full page then run each delalloc range in the page, thus * This is a big feature change for subpage cases, and can hit
* for the following case, we will hit some subpage specific corner case: * different corner cases, so limit this feature to
* experimental builds for now.
* *
* 0 32K 64K * ETA for moving this out of experimental builds is 6.15.
* | |///////| |///////|
* \- A \- B
*
* In above case, both range A and range B will try to unlock the full
* page [0, 64K), causing the one finished later will have page
* unlocked already, triggering various page lock requirement BUG_ON()s.
*
* So here we add an artificial limit that subpage compression can only
* if the range is fully page aligned.
*
* In theory we only need to ensure the first page is fully covered, but
* the tailing partial page will be locked until the full compression
* finishes, delaying the write of other range.
*
* TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
* first to prevent any submitted async extent to unlock the full page.
* By this, we can ensure for subpage case that only the last async_cow
* will unlock the full page.
*/ */
if (fs_info->sectorsize < PAGE_SIZE) { if (fs_info->sectorsize < PAGE_SIZE &&
!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
if (!PAGE_ALIGNED(start) || if (!PAGE_ALIGNED(start) ||
!PAGE_ALIGNED(end + 1)) !PAGE_ALIGNED(end + 1))
return 0; return 0;
@ -896,13 +879,14 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e
for (unsigned long index = start >> PAGE_SHIFT; for (unsigned long index = start >> PAGE_SHIFT;
index <= end_index; index++) { index <= end_index; index++) {
folio = __filemap_get_folio(inode->i_mapping, index, 0, 0); folio = filemap_get_folio(inode->i_mapping, index);
if (IS_ERR(folio)) { if (IS_ERR(folio)) {
if (!ret) if (!ret)
ret = PTR_ERR(folio); ret = PTR_ERR(folio);
continue; continue;
} }
folio_clear_dirty_for_io(folio); btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
end + 1 - start);
folio_put(folio); folio_put(folio);
} }
return ret; return ret;
@ -1001,17 +985,6 @@ again:
(start > 0 || end + 1 < inode->disk_i_size)) (start > 0 || end + 1 < inode->disk_i_size))
goto cleanup_and_bail_uncompressed; goto cleanup_and_bail_uncompressed;
/*
* For subpage case, we require full page alignment for the sector
* aligned range.
* Thus we must also check against @actual_end, not just @end.
*/
if (blocksize < PAGE_SIZE) {
if (!PAGE_ALIGNED(start) ||
!PAGE_ALIGNED(round_up(actual_end, blocksize)))
goto cleanup_and_bail_uncompressed;
}
total_compressed = min_t(unsigned long, total_compressed, total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED); BTRFS_MAX_UNCOMPRESSED);
total_in = 0; total_in = 0;
@ -1359,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
u64 alloc_hint = 0; u64 alloc_hint = 0;
u64 orig_start = start; u64 orig_start = start;
u64 num_bytes; u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0; u64 cur_alloc_size = 0;
u64 min_alloc_size; u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize; u64 blocksize = fs_info->sectorsize;
@ -1367,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct extent_map *em; struct extent_map *em;
unsigned clear_bits; unsigned clear_bits;
unsigned long page_ops; unsigned long page_ops;
bool extent_reserved = false;
int ret = 0; int ret = 0;
if (btrfs_is_free_space_inode(inode)) { if (btrfs_is_free_space_inode(inode)) {
@ -1421,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *ordered;
struct btrfs_file_extent file_extent; struct btrfs_file_extent file_extent;
cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
min_alloc_size, 0, alloc_hint, min_alloc_size, 0, alloc_hint,
&ins, 1, 1); &ins, 1, 1);
if (ret == -EAGAIN) { if (ret == -EAGAIN) {
@ -1453,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret < 0) if (ret < 0)
goto out_unlock; goto out_unlock;
cur_alloc_size = ins.offset; cur_alloc_size = ins.offset;
extent_reserved = true;
ram_size = ins.offset;
file_extent.disk_bytenr = ins.objectid; file_extent.disk_bytenr = ins.objectid;
file_extent.disk_num_bytes = ins.offset; file_extent.disk_num_bytes = ins.offset;
file_extent.num_bytes = ins.offset; file_extent.num_bytes = ins.offset;
@ -1463,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0; file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE; file_extent.compression = BTRFS_COMPRESS_NONE;
lock_extent(&inode->io_tree, start, start + ram_size - 1, lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached); &cached);
em = btrfs_create_io_em(inode, start, &file_extent, em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR); BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) { if (IS_ERR(em)) {
unlock_extent(&inode->io_tree, start, unlock_extent(&inode->io_tree, start,
start + ram_size - 1, &cached); start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em); ret = PTR_ERR(em);
goto out_reserve; goto out_reserve;
} }
@ -1480,7 +1448,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
1 << BTRFS_ORDERED_REGULAR); 1 << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) { if (IS_ERR(ordered)) {
unlock_extent(&inode->io_tree, start, unlock_extent(&inode->io_tree, start,
start + ram_size - 1, &cached); start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(ordered); ret = PTR_ERR(ordered);
goto out_drop_extent_cache; goto out_drop_extent_cache;
} }
@ -1501,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*/ */
if (ret) if (ret)
btrfs_drop_extent_map_range(inode, start, btrfs_drop_extent_map_range(inode, start,
start + ram_size - 1, start + cur_alloc_size - 1,
false); false);
} }
btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered);
@ -1519,7 +1487,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED; page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1,
locked_folio, &cached, locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC, EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops); page_ops);
@ -1529,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset; alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size; start += cur_alloc_size;
extent_reserved = false; cur_alloc_size = 0;
/* /*
* btrfs_reloc_clone_csums() error, since start is increased * btrfs_reloc_clone_csums() error, since start is increased
@ -1545,7 +1513,7 @@ done:
return ret; return ret;
out_drop_extent_cache: out_drop_extent_cache:
btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve: out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@ -1599,13 +1567,12 @@ out_unlock:
* to decrement again the data space_info's bytes_may_use counter, * to decrement again the data space_info's bytes_may_use counter,
* therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/ */
if (extent_reserved) { if (cur_alloc_size) {
extent_clear_unlock_delalloc(inode, start, extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1, start + cur_alloc_size - 1,
locked_folio, &cached, clear_bits, locked_folio, &cached, clear_bits,
page_ops); page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
start += cur_alloc_size;
} }
/* /*
@ -1614,11 +1581,13 @@ out_unlock:
* space_info's bytes_may_use counter, reserved in * space_info's bytes_may_use counter, reserved in
* btrfs_check_data_free_space(). * btrfs_check_data_free_space().
*/ */
if (start < end) { if (start + cur_alloc_size < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV; clear_bits |= EXTENT_CLEAR_DATA_RESV;
extent_clear_unlock_delalloc(inode, start, end, locked_folio, extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
end, locked_folio,
&cached, clear_bits, page_ops); &cached, clear_bits, page_ops);
btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL); btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
} }
return ret; return ret;
} }
@ -3094,10 +3063,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out; goto out;
} }
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
btrfs_inode_safe_disk_i_size_write(inode, 0);
if (freespace_inode) if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root); trans = btrfs_join_transaction_spacecache(root);
else else
@ -3107,37 +3072,36 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans = NULL; trans = NULL;
goto out; goto out;
} }
trans->block_rsv = &inode->block_rsv; trans->block_rsv = &inode->block_rsv;
ret = btrfs_update_inode_fallback(trans, inode);
if (ret) /* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
ret = btrfs_insert_raid_extent(trans, ordered_extent); ret = btrfs_insert_raid_extent(trans, ordered_extent);
if (ret) if (ret) {
btrfs_abort_transaction(trans, ret); btrfs_abort_transaction(trans, ret);
goto out;
}
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
/* Logic error */
ASSERT(list_empty(&ordered_extent->list));
if (!list_empty(&ordered_extent->list)) {
ret = -EINVAL;
btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
if (ret) {
/* -ENOMEM or corruption */
btrfs_abort_transaction(trans, ret);
}
goto out; goto out;
} }
clear_bits |= EXTENT_LOCKED; clear_bits |= EXTENT_LOCKED;
lock_extent(io_tree, start, end, &cached_state); lock_extent(io_tree, start, end, &cached_state);
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
if (ret)
goto out;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type; compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@ -3791,14 +3755,45 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
return 0; return 0;
} }
static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
{
struct btrfs_root *root = inode->root;
struct btrfs_inode *existing;
const u64 ino = btrfs_ino(inode);
int ret;
if (inode_unhashed(&inode->vfs_inode))
return 0;
if (prealloc) {
ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
if (ret)
return ret;
}
existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
if (xa_is_err(existing)) {
ret = xa_err(existing);
ASSERT(ret != -EINVAL);
ASSERT(ret != -ENOMEM);
return ret;
} else if (existing) {
WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
}
return 0;
}
/* /*
* read an inode from the btree into the in-memory inode * Read a locked inode from the btree into the in-memory inode and add it to
* its root list/tree.
*
* On failure clean up the inode.
*/ */
static int btrfs_read_locked_inode(struct inode *inode, static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
struct btrfs_path *in_path)
{ {
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct btrfs_path *path = in_path;
struct extent_buffer *leaf; struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item; struct btrfs_inode_item *inode_item;
struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *root = BTRFS_I(inode)->root;
@ -3812,25 +3807,25 @@ static int btrfs_read_locked_inode(struct inode *inode,
ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
if (ret) if (ret)
return ret; goto out;
ret = btrfs_fill_inode(inode, &rdev); ret = btrfs_fill_inode(inode, &rdev);
if (!ret) if (!ret)
filled = true; filled = true;
if (!path) { ASSERT(path);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
}
btrfs_get_inode_key(BTRFS_I(inode), &location); btrfs_get_inode_key(BTRFS_I(inode), &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0); ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) { if (ret) {
if (path != in_path) /*
btrfs_free_path(path); * ret > 0 can come from btrfs_search_slot called by
return ret; * btrfs_lookup_inode(), this means the inode was not found.
*/
if (ret > 0)
ret = -ENOENT;
goto out;
} }
leaf = path->nodes[0]; leaf = path->nodes[0];
@ -3965,8 +3960,6 @@ cache_acl:
btrfs_ino(BTRFS_I(inode)), btrfs_ino(BTRFS_I(inode)),
btrfs_root_id(root), ret); btrfs_root_id(root), ret);
} }
if (path != in_path)
btrfs_free_path(path);
if (!maybe_acls) if (!maybe_acls)
cache_no_acl(inode); cache_no_acl(inode);
@ -3993,7 +3986,15 @@ cache_acl:
} }
btrfs_sync_inode_flags_to_i_flags(inode); btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
if (ret)
goto out;
return 0; return 0;
out:
iget_failed(inode);
return ret;
} }
/* /*
@ -5502,35 +5503,7 @@ out:
return err; return err;
} }
static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
{
struct btrfs_root *root = inode->root;
struct btrfs_inode *existing;
const u64 ino = btrfs_ino(inode);
int ret;
if (inode_unhashed(&inode->vfs_inode))
return 0;
if (prealloc) {
ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
if (ret)
return ret;
}
existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
if (xa_is_err(existing)) {
ret = xa_err(existing);
ASSERT(ret != -EINVAL);
ASSERT(ret != -ENOMEM);
return ret;
} else if (existing) {
WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
}
return 0;
}
static void btrfs_del_inode_from_root(struct btrfs_inode *inode) static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{ {
@ -5592,10 +5565,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
} }
/* /*
* Get an inode object given its inode number and corresponding root. * Get an inode object given its inode number and corresponding root. Path is
* Path can be preallocated to prevent recursing back to iget through * preallocated to prevent recursing back to iget through the allocator.
* allocator. NULL is also valid but may require an additional allocation
* later.
*/ */
struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
struct btrfs_path *path) struct btrfs_path *path)
@ -5611,30 +5582,40 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
return inode; return inode;
ret = btrfs_read_locked_inode(inode, path); ret = btrfs_read_locked_inode(inode, path);
/* if (ret)
* ret > 0 can come from btrfs_search_slot called by return ERR_PTR(ret);
* btrfs_read_locked_inode(), this means the inode item was not found.
*/
if (ret > 0)
ret = -ENOENT;
if (ret < 0)
goto error;
ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
if (ret < 0)
goto error;
unlock_new_inode(inode); unlock_new_inode(inode);
return inode; return inode;
error:
iget_failed(inode);
return ERR_PTR(ret);
} }
/*
* Get an inode object given its inode number and corresponding root.
*/
struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{ {
return btrfs_iget_path(ino, root, NULL); struct inode *inode;
struct btrfs_path *path;
int ret;
inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
ret = btrfs_read_locked_inode(inode, path);
btrfs_free_path(path);
if (ret)
return ERR_PTR(ret);
unlock_new_inode(inode);
return inode;
} }
static struct inode *new_simple_dir(struct inode *dir, static struct inode *new_simple_dir(struct inode *dir,
@ -6023,7 +6004,7 @@ again:
* offset. This means that new entries created during readdir * offset. This means that new entries created during readdir
* are *guaranteed* to be seen in the future by that readdir. * are *guaranteed* to be seen in the future by that readdir.
* This has broken buggy programs which operate on names as * This has broken buggy programs which operate on names as
* they're returned by readdir. Until we re-use freed offsets * they're returned by readdir. Until we reuse freed offsets
* we have this hack to stop new entries from being returned * we have this hack to stop new entries from being returned
* under the assumption that they'll never reach this huge * under the assumption that they'll never reach this huge
* offset. * offset.
@ -6765,8 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
return ret; return ret;
} }
static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
struct folio *folio)
{ {
struct btrfs_file_extent_item *fi; struct btrfs_file_extent_item *fi;
void *kaddr; void *kaddr;
@ -6964,7 +6944,7 @@ next:
ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE); ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize); ASSERT(em->len == fs_info->sectorsize);
ret = read_inline_extent(inode, path, folio); ret = read_inline_extent(path, folio);
if (ret < 0) if (ret < 0)
goto out; goto out;
goto insert; goto insert;
@ -8972,28 +8952,6 @@ out_inode:
return finish_open_simple(file, ret); return finish_open_simple(file, ret);
} }
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
struct folio *folio;
u32 len;
ASSERT(end + 1 - start <= U32_MAX);
len = end + 1 - start;
while (index <= end_index) {
folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */
/* This is for data, which doesn't yet support larger folio. */
ASSERT(folio_order(folio) == 0);
btrfs_folio_set_writeback(fs_info, folio, start, len);
folio_put(folio);
index++;
}
}
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type) int compress_type)
{ {
@ -9038,12 +8996,16 @@ static ssize_t btrfs_encoded_read_inline(
unsigned long ptr; unsigned long ptr;
void *tmp; void *tmp;
ssize_t ret; ssize_t ret;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
path = btrfs_alloc_path(); path = btrfs_alloc_path();
if (!path) { if (!path) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
path->nowait = nowait;
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0); extent_start, 0);
if (ret) { if (ret) {
@ -9107,6 +9069,7 @@ out:
struct btrfs_encoded_read_private { struct btrfs_encoded_read_private {
wait_queue_head_t wait; wait_queue_head_t wait;
void *uring_ctx;
atomic_t pending; atomic_t pending;
blk_status_t status; blk_status_t status;
}; };
@ -9126,26 +9089,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/ */
WRITE_ONCE(priv->status, bbio->bio.bi_status); WRITE_ONCE(priv->status, bbio->bio.bi_status);
} }
if (!atomic_dec_return(&priv->pending)) if (atomic_dec_return(&priv->pending) == 0) {
int err = blk_status_to_errno(READ_ONCE(priv->status));
if (priv->uring_ctx) {
btrfs_uring_read_extent_endio(priv->uring_ctx, err);
kfree(priv);
} else {
wake_up(&priv->wait); wake_up(&priv->wait);
}
}
bio_put(&bbio->bio); bio_put(&bbio->bio);
} }
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 file_offset, u64 disk_bytenr, u64 disk_bytenr, u64 disk_io_size,
u64 disk_io_size, struct page **pages) struct page **pages, void *uring_ctx)
{ {
struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_encoded_read_private priv = { struct btrfs_encoded_read_private *priv;
.pending = ATOMIC_INIT(1),
};
unsigned long i = 0; unsigned long i = 0;
struct btrfs_bio *bbio; struct btrfs_bio *bbio;
int ret;
init_waitqueue_head(&priv.wait); priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
if (!priv)
return -ENOMEM;
init_waitqueue_head(&priv->wait);
atomic_set(&priv->pending, 1);
priv->status = 0;
priv->uring_ctx = uring_ctx;
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
btrfs_encoded_read_endio, &priv); btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode; bbio->inode = inode;
@ -9153,11 +9130,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
atomic_inc(&priv.pending); atomic_inc(&priv->pending);
btrfs_submit_bbio(bbio, 0); btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
btrfs_encoded_read_endio, &priv); btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode; bbio->inode = inode;
continue; continue;
@ -9168,22 +9145,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes; disk_io_size -= bytes;
} while (disk_io_size); } while (disk_io_size);
atomic_inc(&priv.pending); atomic_inc(&priv->pending);
btrfs_submit_bbio(bbio, 0); btrfs_submit_bbio(bbio, 0);
if (atomic_dec_return(&priv.pending)) if (uring_ctx) {
io_wait_event(priv.wait, !atomic_read(&priv.pending)); if (atomic_dec_return(&priv->pending) == 0) {
ret = blk_status_to_errno(READ_ONCE(priv->status));
btrfs_uring_read_extent_endio(uring_ctx, ret);
kfree(priv);
return ret;
}
return -EIOCBQUEUED;
} else {
if (atomic_dec_return(&priv->pending) != 0)
io_wait_event(priv->wait, !atomic_read(&priv->pending));
/* See btrfs_encoded_read_endio() for ordering. */ /* See btrfs_encoded_read_endio() for ordering. */
return blk_status_to_errno(READ_ONCE(priv.status)); ret = blk_status_to_errno(READ_ONCE(priv->status));
kfree(priv);
return ret;
}
} }
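The rework above moves btrfs_encoded_read_private off the stack so the last reference can finish the read either synchronously (wake_up) or, for io_uring, via btrfs_uring_read_extent_endio() plus kfree(). The counting itself is the usual submission-bias pattern: pending starts at 1, each bio adds 1, and whichever atomic_dec_return() reaches zero performs the completion. A minimal userspace sketch of the same pattern, with hypothetical names and pthreads standing in for bios:

/*
 * "pending" starts at 1 (submission bias), each worker adds 1, and the
 * decrement that hits zero performs the completion. All names are made up.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct demo_priv {
	atomic_int pending;
	pthread_mutex_t lock;
	pthread_cond_t done;
};

static void demo_complete(struct demo_priv *p)
{
	pthread_mutex_lock(&p->lock);
	pthread_cond_signal(&p->done);
	pthread_mutex_unlock(&p->lock);
}

static void *demo_worker(void *arg)
{
	struct demo_priv *p = arg;

	/* ... the "I/O" would happen here ... */
	if (atomic_fetch_sub(&p->pending, 1) == 1)
		demo_complete(p);	/* last reference: complete the request */
	return NULL;
}

int main(void)
{
	struct demo_priv p = { .pending = 1 };	/* submission bias */
	pthread_t workers[4];

	pthread_mutex_init(&p.lock, NULL);
	pthread_cond_init(&p.done, NULL);

	for (int i = 0; i < 4; i++) {
		atomic_fetch_add(&p.pending, 1);	/* one per "bio" */
		pthread_create(&workers[i], NULL, demo_worker, &p);
	}

	/* Drop the bias; wait only if some worker still holds a reference. */
	if (atomic_fetch_sub(&p.pending, 1) != 1) {
		pthread_mutex_lock(&p.lock);
		while (atomic_load(&p.pending) != 0)
			pthread_cond_wait(&p.done, &p.lock);
		pthread_mutex_unlock(&p.lock);
	}

	for (int i = 0; i < 4; i++)
		pthread_join(workers[i], NULL);
	printf("all I/O complete\n");
	return 0;
}

The bias held by the submitter guarantees completion cannot run before all bios have been submitted, which is why in the io_uring case the final decrement, and thus the call into btrfs_uring_read_extent_endio(), may legitimately happen on either the submit or the endio side.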
static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
struct iov_iter *iter,
u64 start, u64 lockend, u64 start, u64 lockend,
struct extent_state **cached_state, struct extent_state **cached_state,
u64 disk_bytenr, u64 disk_io_size, u64 disk_bytenr, u64 disk_io_size,
size_t count, bool compressed, size_t count, bool compressed, bool *unlocked)
bool *unlocked)
{ {
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree; struct extent_io_tree *io_tree = &inode->io_tree;
@ -9203,8 +9191,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
goto out; goto out;
} }
ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
disk_io_size, pages); disk_io_size, pages, NULL);
if (ret) if (ret)
goto out; goto out;
@ -9244,21 +9232,26 @@ out:
} }
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded) struct btrfs_ioctl_encoded_io_args *encoded,
struct extent_state **cached_state,
u64 *disk_bytenr, u64 *disk_io_size)
{ {
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree; struct extent_io_tree *io_tree = &inode->io_tree;
ssize_t ret; ssize_t ret;
size_t count = iov_iter_count(iter); size_t count = iov_iter_count(iter);
u64 start, lockend, disk_bytenr, disk_io_size; u64 start, lockend;
struct extent_state *cached_state = NULL;
struct extent_map *em; struct extent_map *em;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
bool unlocked = false; bool unlocked = false;
file_accessed(iocb->ki_filp); file_accessed(iocb->ki_filp);
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); ret = btrfs_inode_lock(inode,
BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
if (ret)
return ret;
if (iocb->ki_pos >= inode->vfs_inode.i_size) { if (iocb->ki_pos >= inode->vfs_inode.i_size) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@ -9271,6 +9264,29 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/ */
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
if (nowait) {
struct btrfs_ordered_extent *ordered;
if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
start, lockend)) {
ret = -EAGAIN;
goto out_unlock_inode;
}
if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
ret = -EAGAIN;
goto out_unlock_inode;
}
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (ordered) {
btrfs_put_ordered_extent(ordered);
unlock_extent(io_tree, start, lockend, cached_state);
ret = -EAGAIN;
goto out_unlock_inode;
}
} else {
for (;;) { for (;;) {
struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *ordered;
@ -9278,15 +9294,17 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
lockend - start + 1); lockend - start + 1);
if (ret) if (ret)
goto out_unlock_inode; goto out_unlock_inode;
lock_extent(io_tree, start, lockend, &cached_state);
lock_extent(io_tree, start, lockend, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1); lockend - start + 1);
if (!ordered) if (!ordered)
break; break;
btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered);
unlock_extent(io_tree, start, lockend, &cached_state); unlock_extent(io_tree, start, lockend, cached_state);
cond_resched(); cond_resched();
} }
}
em = btrfs_get_extent(inode, NULL, start, lockend - start + 1); em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
if (IS_ERR(em)) { if (IS_ERR(em)) {
@ -9304,9 +9322,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
free_extent_map(em); free_extent_map(em);
em = NULL; em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
&cached_state, extent_start, cached_state, extent_start,
count, encoded, &unlocked); count, encoded, &unlocked);
goto out; goto out_unlock_extent;
} }
/* /*
@ -9317,12 +9335,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
inode->vfs_inode.i_size) - iocb->ki_pos; inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->disk_bytenr == EXTENT_MAP_HOLE || if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) { (em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE; *disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len); count = min_t(u64, count, encoded->len);
encoded->len = count; encoded->len = count;
encoded->unencoded_len = count; encoded->unencoded_len = count;
} else if (extent_map_is_compressed(em)) { } else if (extent_map_is_compressed(em)) {
disk_bytenr = em->disk_bytenr; *disk_bytenr = em->disk_bytenr;
/* /*
* Bail if the buffer isn't large enough to return the whole * Bail if the buffer isn't large enough to return the whole
* compressed extent. * compressed extent.
@ -9331,7 +9349,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ret = -ENOBUFS; ret = -ENOBUFS;
goto out_em; goto out_em;
} }
disk_io_size = em->disk_num_bytes; *disk_io_size = em->disk_num_bytes;
count = em->disk_num_bytes; count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes; encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
@ -9341,47 +9359,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_em; goto out_em;
encoded->compression = ret; encoded->compression = ret;
} else { } else {
disk_bytenr = extent_map_block_start(em) + (start - em->start); *disk_bytenr = extent_map_block_start(em) + (start - em->start);
if (encoded->len > count) if (encoded->len > count)
encoded->len = count; encoded->len = count;
/* /*
* Don't read beyond what we locked. This also limits the page * Don't read beyond what we locked. This also limits the page
* allocations that we'll do. * allocations that we'll do.
*/ */
disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
count = start + disk_io_size - iocb->ki_pos; count = start + *disk_io_size - iocb->ki_pos;
encoded->len = count; encoded->len = count;
encoded->unencoded_len = count; encoded->unencoded_len = count;
disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
} }
free_extent_map(em); free_extent_map(em);
em = NULL; em = NULL;
if (disk_bytenr == EXTENT_MAP_HOLE) { if (*disk_bytenr == EXTENT_MAP_HOLE) {
unlock_extent(io_tree, start, lockend, &cached_state); unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true; unlocked = true;
ret = iov_iter_zero(count, iter); ret = iov_iter_zero(count, iter);
if (ret != count) if (ret != count)
ret = -EFAULT; ret = -EFAULT;
} else { } else {
ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, ret = -EIOCBQUEUED;
&cached_state, disk_bytenr, goto out_unlock_extent;
disk_io_size, count,
encoded->compression,
&unlocked);
} }
out:
if (ret >= 0)
iocb->ki_pos += encoded->len;
out_em: out_em:
free_extent_map(em); free_extent_map(em);
out_unlock_extent: out_unlock_extent:
if (!unlocked) /* Leave inode and extent locked if we need to do a read. */
unlock_extent(io_tree, start, lockend, &cached_state); if (!unlocked && ret != -EIOCBQUEUED)
unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode: out_unlock_inode:
if (!unlocked) if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret; return ret;
} }
@ -9492,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
*/ */
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
if (!folios) if (!folios)
return -ENOMEM; return -ENOMEM;
for (i = 0; i < nr_folios; i++) { for (i = 0; i < nr_folios; i++) {
@ -9556,7 +9569,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->unencoded_len == encoded->len && if (encoded->unencoded_len == encoded->len &&
encoded->unencoded_offset == 0 && encoded->unencoded_offset == 0 &&
can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
ret = __cow_file_range_inline(inode, start, encoded->len, ret = __cow_file_range_inline(inode, encoded->len,
orig_count, compression, folios[0], orig_count, compression, folios[0],
true); true);
if (ret <= 0) { if (ret <= 0) {


@ -29,6 +29,7 @@
#include <linux/fileattr.h> #include <linux/fileattr.h>
#include <linux/fsverity.h> #include <linux/fsverity.h>
#include <linux/sched/xacct.h> #include <linux/sched/xacct.h>
#include <linux/io_uring/cmd.h>
#include "ctree.h" #include "ctree.h"
#include "disk-io.h" #include "disk-io.h"
#include "export.h" #include "export.h"
@ -1048,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
struct btrfs_qgroup_inherit *inherit) struct btrfs_qgroup_inherit *inherit)
{ {
int ret; int ret;
bool snapshot_force_cow = false;
/* /*
* Force new buffered writes to reserve space even when NOCOW is * Force new buffered writes to reserve space even when NOCOW is
@ -1067,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
* creation. * creation.
*/ */
atomic_inc(&root->snapshot_force_cow); atomic_inc(&root->snapshot_force_cow);
snapshot_force_cow = true;
btrfs_wait_ordered_extents(root, U64_MAX, NULL); btrfs_wait_ordered_extents(root, U64_MAX, NULL);
ret = btrfs_mksubvol(parent, idmap, name, namelen, ret = btrfs_mksubvol(parent, idmap, name, namelen,
root, readonly, inherit); root, readonly, inherit);
out:
if (snapshot_force_cow)
atomic_dec(&root->snapshot_force_cow); atomic_dec(&root->snapshot_force_cow);
out:
btrfs_drew_read_unlock(&root->snapshot_lock); btrfs_drew_read_unlock(&root->snapshot_lock);
return ret; return ret;
} }
@ -4057,8 +4055,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
return 0; return 0;
} }
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info)
void __user *arg)
{ {
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_ADMIN))
return -EPERM; return -EPERM;
@ -4513,12 +4510,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
flags); flags);
size_t copy_end; size_t copy_end;
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
struct iovec iovstack[UIO_FASTIOV]; struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack; struct iovec *iov = iovstack;
struct iov_iter iter; struct iov_iter iter;
loff_t pos; loff_t pos;
struct kiocb kiocb; struct kiocb kiocb;
ssize_t ret; ssize_t ret;
u64 disk_bytenr, disk_io_size;
struct extent_state *cached_state = NULL;
if (!capable(CAP_SYS_ADMIN)) { if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM; ret = -EPERM;
@ -4571,7 +4573,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
init_sync_kiocb(&kiocb, file); init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos; kiocb.ki_pos = pos;
ret = btrfs_encoded_read(&kiocb, &iter, &args); ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
&disk_bytenr, &disk_io_size);
if (ret == -EIOCBQUEUED) {
bool unlocked = false;
u64 start, lockend, count;
start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize);
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
if (args.compression)
count = disk_io_size;
else
count = args.len;
ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend,
&cached_state, disk_bytenr,
disk_io_size, count,
args.compression, &unlocked);
if (!unlocked) {
unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
}
}
if (ret >= 0) { if (ret >= 0) {
fsnotify_access(file); fsnotify_access(file);
if (copy_to_user(argp + copy_end, if (copy_to_user(argp + copy_end,
@ -4689,6 +4716,439 @@ out_acct:
return ret; return ret;
} }
/*
* Context that's attached to an encoded read io_uring command, in cmd->pdu. It
* contains the fields in btrfs_uring_read_extent that are necessary to finish
* off and clean up the I/O in btrfs_uring_read_finished.
*/
struct btrfs_uring_priv {
struct io_uring_cmd *cmd;
struct page **pages;
unsigned long nr_pages;
struct kiocb iocb;
struct iovec *iov;
struct iov_iter iter;
struct extent_state *cached_state;
u64 count;
u64 start;
u64 lockend;
int err;
bool compressed;
};
struct io_btrfs_cmd {
struct btrfs_uring_priv *priv;
};
static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
struct btrfs_uring_priv *priv = bc->priv;
struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
unsigned long index;
u64 cur;
size_t page_offset;
ssize_t ret;
if (priv->err) {
ret = priv->err;
goto out;
}
if (priv->compressed) {
index = 0;
page_offset = 0;
} else {
index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
}
cur = 0;
while (cur < priv->count) {
size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset);
if (copy_page_to_iter(priv->pages[index], page_offset, bytes,
&priv->iter) != bytes) {
ret = -EFAULT;
goto out;
}
index++;
cur += bytes;
page_offset = 0;
}
ret = priv->count;
out:
unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
io_uring_cmd_done(cmd, ret, 0, issue_flags);
add_rchar(current, ret);
for (index = 0; index < priv->nr_pages; index++)
__free_page(priv->pages[index]);
kfree(priv->pages);
kfree(priv->iov);
kfree(priv);
}
void btrfs_uring_read_extent_endio(void *ctx, int err)
{
struct btrfs_uring_priv *priv = ctx;
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd);
priv->err = err;
bc->priv = priv;
io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
}
static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
u64 start, u64 lockend,
struct extent_state *cached_state,
u64 disk_bytenr, u64 disk_io_size,
size_t count, bool compressed,
struct iovec *iov, struct io_uring_cmd *cmd)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
struct page **pages;
struct btrfs_uring_priv *priv = NULL;
unsigned long nr_pages;
int ret;
nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
ret = btrfs_alloc_page_array(nr_pages, pages, 0);
if (ret) {
ret = -ENOMEM;
goto out_fail;
}
priv = kmalloc(sizeof(*priv), GFP_NOFS);
if (!priv) {
ret = -ENOMEM;
goto out_fail;
}
priv->iocb = *iocb;
priv->iov = iov;
priv->iter = *iter;
priv->count = count;
priv->cmd = cmd;
priv->cached_state = cached_state;
priv->compressed = compressed;
priv->nr_pages = nr_pages;
priv->pages = pages;
priv->start = start;
priv->lockend = lockend;
priv->err = 0;
ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
disk_io_size, pages, priv);
if (ret && ret != -EIOCBQUEUED)
goto out_fail;
/*
* If we return -EIOCBQUEUED, we're deferring the cleanup to
* btrfs_uring_read_finished(), which will handle unlocking the extent
* and inode and freeing the allocations.
*/
return -EIOCBQUEUED;
out_fail:
unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
kfree(priv);
return ret;
}

static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
	size_t copy_end;
	struct btrfs_ioctl_encoded_io_args args = { 0 };
	int ret;
	u64 disk_bytenr, disk_io_size;
	struct file *file;
	struct btrfs_inode *inode;
	struct btrfs_fs_info *fs_info;
	struct extent_io_tree *io_tree;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	loff_t pos;
	struct kiocb kiocb;
	struct extent_state *cached_state = NULL;
	u64 start, lockend;
	void __user *sqe_addr;

	if (!capable(CAP_SYS_ADMIN)) {
		ret = -EPERM;
		goto out_acct;
	}
	file = cmd->file;
	inode = BTRFS_I(file->f_inode);
	fs_info = inode->root->fs_info;
	io_tree = &inode->io_tree;
	sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));

	if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
		struct btrfs_ioctl_encoded_io_args_32 args32;

		copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
		if (copy_from_user(&args32, sqe_addr, copy_end)) {
			ret = -EFAULT;
			goto out_acct;
		}
		args.iov = compat_ptr(args32.iov);
		args.iovcnt = args32.iovcnt;
		args.offset = args32.offset;
		args.flags = args32.flags;
#else
		return -ENOTTY;
#endif
	} else {
		copy_end = copy_end_kernel;
		if (copy_from_user(&args, sqe_addr, copy_end)) {
			ret = -EFAULT;
			goto out_acct;
		}
	}

	if (args.flags != 0)
		return -EINVAL;

	ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
			   &iov, &iter);
	if (ret < 0)
		goto out_acct;

	if (iov_iter_count(&iter) == 0) {
		ret = 0;
		goto out_free;
	}

	pos = args.offset;
	ret = rw_verify_area(READ, file, &pos, args.len);
	if (ret < 0)
		goto out_free;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos;

	if (issue_flags & IO_URING_F_NONBLOCK)
		kiocb.ki_flags |= IOCB_NOWAIT;

	start = ALIGN_DOWN(pos, fs_info->sectorsize);
	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;

	ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
				 &disk_bytenr, &disk_io_size);
	if (ret < 0 && ret != -EIOCBQUEUED)
		goto out_free;

	file_accessed(file);

	if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel,
			 sizeof(args) - copy_end_kernel)) {
		if (ret == -EIOCBQUEUED) {
			unlock_extent(io_tree, start, lockend, &cached_state);
			btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
		}
		ret = -EFAULT;
		goto out_free;
	}

	if (ret == -EIOCBQUEUED) {
		u64 count;

		/*
		 * If we've optimized things by storing the iovecs on the stack,
		 * undo this.
		 */
		if (!iov) {
			iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
			if (!iov) {
				unlock_extent(io_tree, start, lockend, &cached_state);
				btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
				ret = -ENOMEM;
				goto out_acct;
			}

			memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
		}

		count = min_t(u64, iov_iter_count(&iter), disk_io_size);

		/* Match ioctl by not returning past EOF if uncompressed. */
		if (!args.compression)
			count = min_t(u64, count, args.len);

		ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
					      cached_state, disk_bytenr,
					      disk_io_size, count,
					      args.compression, iov, cmd);

		goto out_acct;
	}

out_free:
	kfree(iov);

out_acct:
	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);

	return ret;
}

int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	switch (cmd->cmd_op) {
	case BTRFS_IOC_ENCODED_READ:
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
	case BTRFS_IOC_ENCODED_READ_32:
#endif
		return btrfs_uring_encoded_read(cmd, issue_flags);
	}

	return -EINVAL;
}
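
For context on how this entry point is meant to be driven, here is a minimal userspace sketch using liburing. The ring setup and completion helpers are standard liburing calls; the SQE layout for the btrfs command (cmd_op set to the encoded-read ioctl number, the args struct pointer passed via sqe->addr) is inferred from btrfs_uring_encoded_read() above, so treat the whole function as an illustration rather than reference code.

/*
 * Sketch: issue BTRFS_IOC_ENCODED_READ through io_uring.
 * The caller fills args (iov, iovcnt, offset); needs CAP_SYS_ADMIN.
 */
#include <liburing.h>
#include <linux/btrfs.h>
#include <stdint.h>
#include <string.h>

static int encoded_read_uring(int fd, struct btrfs_ioctl_encoded_io_args *args)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0)
		return ret;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = BTRFS_IOC_ENCODED_READ;
	/* The kernel reads the args struct from sqe->addr (see above). */
	sqe->addr = (__u64)(uintptr_t)args;

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret == 0) {
		ret = cqe->res;		/* bytes read, or -errno */
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return ret;
}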

static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp)
{
	struct btrfs_root *root;
	struct btrfs_ioctl_subvol_wait args = { 0 };
	signed long sched_ret;
	int refs;
	u64 root_flags;
	bool wait_for_deletion = false;
	bool found = false;

	if (copy_from_user(&args, argp, sizeof(args)))
		return -EFAULT;
	switch (args.mode) {
	case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED:
		/*
		 * Wait for the first one deleted that waits until all previous
		 * are cleaned.
		 */
		spin_lock(&fs_info->trans_lock);
		if (!list_empty(&fs_info->dead_roots)) {
			root = list_last_entry(&fs_info->dead_roots,
					       struct btrfs_root, root_list);
			args.subvolid = btrfs_root_id(root);
			found = true;
		}
		spin_unlock(&fs_info->trans_lock);
		if (!found)
			return -ENOENT;

		fallthrough;
	case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE:
		if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) ||
		    BTRFS_LAST_FREE_OBJECTID < args.subvolid)
			return -EINVAL;
		break;
	case BTRFS_SUBVOL_SYNC_COUNT:
		spin_lock(&fs_info->trans_lock);
		args.count = list_count_nodes(&fs_info->dead_roots);
		spin_unlock(&fs_info->trans_lock);
		if (copy_to_user(argp, &args, sizeof(args)))
			return -EFAULT;
		return 0;
	case BTRFS_SUBVOL_SYNC_PEEK_FIRST:
		spin_lock(&fs_info->trans_lock);
		/* Last in the list was deleted first. */
		if (!list_empty(&fs_info->dead_roots)) {
			root = list_last_entry(&fs_info->dead_roots,
					       struct btrfs_root, root_list);
			args.subvolid = btrfs_root_id(root);
		} else {
			args.subvolid = 0;
		}
		spin_unlock(&fs_info->trans_lock);
		if (copy_to_user(argp, &args, sizeof(args)))
			return -EFAULT;
		return 0;
	case BTRFS_SUBVOL_SYNC_PEEK_LAST:
		spin_lock(&fs_info->trans_lock);
		/* First in the list was deleted last. */
		if (!list_empty(&fs_info->dead_roots)) {
			root = list_first_entry(&fs_info->dead_roots,
						struct btrfs_root, root_list);
			args.subvolid = btrfs_root_id(root);
		} else {
			args.subvolid = 0;
		}
		spin_unlock(&fs_info->trans_lock);
		if (copy_to_user(argp, &args, sizeof(args)))
			return -EFAULT;
		return 0;
	default:
		return -EINVAL;
	}

	/* 32bit limitation: fs_roots_radix key is not wide enough. */
	if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX)
		return -EOVERFLOW;

	while (1) {
		/* Wait for the specific one. */
		if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR)
			return -EINTR;
		refs = -1;
		spin_lock(&fs_info->fs_roots_radix_lock);
		root = radix_tree_lookup(&fs_info->fs_roots_radix,
					 (unsigned long)args.subvolid);
		if (root) {
			spin_lock(&root->root_item_lock);
			refs = btrfs_root_refs(&root->root_item);
			root_flags = btrfs_root_flags(&root->root_item);
			spin_unlock(&root->root_item_lock);
		}
		spin_unlock(&fs_info->fs_roots_radix_lock);
		up_read(&fs_info->subvol_sem);

		/* Subvolume does not exist. */
		if (!root)
			return -ENOENT;
		/* Subvolume not deleted at all. */
		if (refs > 0)
			return -EEXIST;
		/* We've waited and now the subvolume is gone. */
		if (wait_for_deletion && refs == -1) {
			/* Return the one we waited for as the last one. */
			if (copy_to_user(argp, &args, sizeof(args)))
				return -EFAULT;
			return 0;
		}
		/* Subvolume not found on the first try (deleted or never existed). */
		if (refs == -1)
			return -ENOENT;

		wait_for_deletion = true;
		ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD);
		sched_ret = schedule_timeout_interruptible(HZ);
		/* Early wake up or error. */
		if (sched_ret != 0)
			return -EINTR;
	}

	return 0;
}
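
To show the intended consumer, here is a small sketch of how a tool such as "btrfs subvol sync" could use the new request, dispatched through the BTRFS_IOC_SUBVOL_SYNC_WAIT case added to btrfs_ioctl() below. The struct and mode names come from the code above; the helper name and error policy are illustrative assumptions.

/* Sketch: wait until all currently queued dead subvolumes are cleaned. */
#include <sys/ioctl.h>
#include <linux/btrfs.h>
#include <errno.h>

static int wait_for_queued_subvols(int fd)
{
	struct btrfs_ioctl_subvol_wait wait = {
		.mode = BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED,
	};

	if (ioctl(fd, BTRFS_IOC_SUBVOL_SYNC_WAIT, &wait) < 0) {
		if (errno == ENOENT)	/* nothing queued for deletion */
			return 0;
		return -errno;
	}
	/* wait.subvolid now holds the last subvolume that was waited for. */
	return 0;
}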

long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
@@ -4811,7 +5271,7 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
 		return btrfs_ioctl_quota_rescan_status(fs_info, argp);
 	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
-		return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
+		return btrfs_ioctl_quota_rescan_wait(fs_info);
 	case BTRFS_IOC_DEV_REPLACE:
 		return btrfs_ioctl_dev_replace(fs_info, argp);
 	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
@@ -4840,6 +5300,8 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case BTRFS_IOC_ENCODED_WRITE_32:
 		return btrfs_ioctl_encoded_write(file, argp, true);
 #endif
+	case BTRFS_IOC_SUBVOL_SYNC_WAIT:
+		return btrfs_ioctl_subvol_sync(fs_info, argp);
 	}
 	return -ENOTTY;


@@ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
 int __pure btrfs_is_empty_uuid(const u8 *uuid);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 				     struct btrfs_ioctl_balance_args *bargs);
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void btrfs_uring_read_extent_endio(void *ctx, int err);
 #endif


@@ -161,21 +161,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
 	return 0;
 }
 
-/*
- * Try-lock for write.
- *
- * Return 1 if the rwlock has been taken, 0 otherwise
- */
-int btrfs_try_tree_write_lock(struct extent_buffer *eb)
-{
-	if (down_write_trylock(&eb->lock)) {
-		btrfs_set_eb_lock_owner(eb, current->pid);
-		trace_btrfs_try_tree_write_lock(eb);
-		return 1;
-	}
-	return 0;
-}
-
 /*
  * Release read lock.
  */


@@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
 void btrfs_tree_read_unlock(struct extent_buffer *eb);
 int btrfs_try_tree_read_lock(struct extent_buffer *eb);
-int btrfs_try_tree_write_lock(struct extent_buffer *eb);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);


@@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws)
 	kfree(workspace);
 }
 
-struct list_head *lzo_alloc_workspace(unsigned int level)
+struct list_head *lzo_alloc_workspace(void)
 {
 	struct workspace *workspace;

@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
return qgroup; return qgroup;
} }
static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
struct btrfs_qgroup *qgroup)
{ {
struct btrfs_qgroup_list *list; struct btrfs_qgroup_list *list;
@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return -ENOENT; return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree); rb_erase(&qgroup->node, &fs_info->qgroup_tree);
__del_qgroup_rb(fs_info, qgroup); __del_qgroup_rb(qgroup);
return 0; return 0;
} }
@ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
/* /*
* If a qgroup exists for a subvolume ID, it is possible * If a qgroup exists for a subvolume ID, it is possible
* that subvolume has been deleted, in which case * that subvolume has been deleted, in which case
* re-using that ID would lead to incorrect accounting. * reusing that ID would lead to incorrect accounting.
* *
* Ensure that we skip any such subvol ids. * Ensure that we skip any such subvol ids.
* *
@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
while ((n = rb_first(&fs_info->qgroup_tree))) { while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node); qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree); rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(fs_info, qgroup); __del_qgroup_rb(qgroup);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup); btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup); kfree(qgroup);
} }
@ -2002,26 +2001,26 @@ out:
*/ */
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record) struct btrfs_qgroup_extent_record *record,
u64 bytenr)
{ {
struct btrfs_qgroup_extent_record *existing, *ret; struct btrfs_qgroup_extent_record *existing, *ret;
const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits); const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
if (!btrfs_qgroup_full_accounting(fs_info)) if (!btrfs_qgroup_full_accounting(fs_info))
return 1; return 1;
#if BITS_PER_LONG == 32 #if BITS_PER_LONG == 32
if (record->bytenr >= MAX_LFS_FILESIZE) { if (bytenr >= MAX_LFS_FILESIZE) {
btrfs_err_rl(fs_info, btrfs_err_rl(fs_info,
"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit", "qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
record->bytenr); bytenr);
btrfs_err_32bit_limit(fs_info); btrfs_err_32bit_limit(fs_info);
return -EOVERFLOW; return -EOVERFLOW;
} }
#endif #endif
lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr);
trace_btrfs_qgroup_trace_extent(fs_info, record);
xa_lock(&delayed_refs->dirty_extents); xa_lock(&delayed_refs->dirty_extents);
existing = xa_load(&delayed_refs->dirty_extents, index); existing = xa_load(&delayed_refs->dirty_extents, index);
@ -2066,12 +2065,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
* transaction committing, but not now as qgroup accounting will be wrong again. * transaction committing, but not now as qgroup accounting will be wrong again.
*/ */
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord) struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr)
{ {
struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_backref_walk_ctx ctx = {
.bytenr = bytenr,
.fs_info = fs_info,
};
int ret; int ret;
if (!btrfs_qgroup_full_accounting(trans->fs_info)) if (!btrfs_qgroup_full_accounting(fs_info))
return 0; return 0;
/* /*
* We are always called in a context where we are already holding a * We are always called in a context where we are already holding a
@ -2094,16 +2098,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*/ */
ASSERT(trans != NULL); ASSERT(trans != NULL);
if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
return 0; return 0;
ctx.bytenr = qrecord->bytenr;
ctx.fs_info = trans->fs_info;
ret = btrfs_find_all_roots(&ctx, true); ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) { if (ret < 0) {
qgroup_mark_inconsistent(trans->fs_info); qgroup_mark_inconsistent(fs_info);
btrfs_warn(trans->fs_info, btrfs_warn(fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent", "error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret); ret);
return 0; return 0;
@ -2138,7 +2139,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
{ {
struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record; struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs;
const unsigned long index = (bytenr >> fs_info->sectorsize_bits); const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
int ret; int ret;
@ -2148,26 +2149,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!record) if (!record)
return -ENOMEM; return -ENOMEM;
if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) { if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
kfree(record); kfree(record);
return -ENOMEM; return -ENOMEM;
} }
delayed_refs = &trans->transaction->delayed_refs;
record->bytenr = bytenr;
record->num_bytes = num_bytes; record->num_bytes = num_bytes;
record->old_roots = NULL;
spin_lock(&delayed_refs->lock); ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr);
ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
spin_unlock(&delayed_refs->lock);
if (ret) { if (ret) {
/* Clean up if insertion fails or item exists. */ /* Clean up if insertion fails or item exists. */
xa_release(&delayed_refs->dirty_extents, index); xa_release(&delayed_refs->dirty_extents, index);
kfree(record); kfree(record);
return 0; return 0;
} }
return btrfs_qgroup_trace_extent_post(trans, record); return btrfs_qgroup_trace_extent_post(trans, record, bytenr);
} }
/* /*
@ -2652,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
if (!extent_buffer_uptodate(root_eb)) { if (!extent_buffer_uptodate(root_eb)) {
struct btrfs_tree_parent_check check = { struct btrfs_tree_parent_check check = {
.has_first_key = false,
.transid = root_gen, .transid = root_gen,
.level = root_level .level = root_level
}; };
@ -3043,14 +3038,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
delayed_refs = &trans->transaction->delayed_refs; delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip; qgroup_to_skip = delayed_refs->qgroup_to_skip;
xa_for_each(&delayed_refs->dirty_extents, index, record) { xa_for_each(&delayed_refs->dirty_extents, index, record) {
const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits);
num_dirty_extents++; num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record); trace_btrfs_qgroup_account_extents(fs_info, record, bytenr);
if (!ret && !(fs_info->qgroup_flags & if (!ret && !(fs_info->qgroup_flags &
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_backref_walk_ctx ctx = { 0 };
ctx.bytenr = record->bytenr; ctx.bytenr = bytenr;
ctx.fs_info = fs_info; ctx.fs_info = fs_info;
/* /*
@ -3092,7 +3089,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
ulist_del(record->old_roots, qgroup_to_skip, ulist_del(record->old_roots, qgroup_to_skip,
0); 0);
} }
ret = btrfs_qgroup_account_extent(trans, record->bytenr, ret = btrfs_qgroup_account_extent(trans, bytenr,
record->num_bytes, record->num_bytes,
record->old_roots, record->old_roots,
new_roots); new_roots);
@ -4196,13 +4193,20 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0; return 0;
} }
btrfs_run_delayed_iputs(root->fs_info);
btrfs_wait_on_delayed_iputs(root->fs_info);
ret = btrfs_start_delalloc_snapshot(root, true); ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0) if (ret < 0)
goto out; goto out;
btrfs_wait_ordered_extents(root, U64_MAX, NULL); btrfs_wait_ordered_extents(root, U64_MAX, NULL);
/*
* After waiting for ordered extents run delayed iputs in order to free
* space from unlinked files before committing the current transaction,
* as ordered extents may have been holding the last reference of an
* inode and they add a delayed iput when they complete.
*/
btrfs_run_delayed_iputs(root->fs_info);
btrfs_wait_on_delayed_iputs(root->fs_info);
ret = btrfs_commit_current_transaction(root); ret = btrfs_commit_current_transaction(root);
out: out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
@ -4687,8 +4691,7 @@ out:
* BOTH POINTERS ARE BEFORE TREE SWAP * BOTH POINTERS ARE BEFORE TREE SWAP
* @last_snapshot: last snapshot generation of the subvolume tree * @last_snapshot: last snapshot generation of the subvolume tree
*/ */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_root *subvol_root,
struct btrfs_block_group *bg, struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot, struct extent_buffer *reloc_parent, int reloc_slot,
@ -4894,17 +4897,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
xa_destroy(&trans->delayed_refs.dirty_extents); xa_destroy(&trans->delayed_refs.dirty_extents);
} }
void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
{
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
return;
if (!is_fstree(root))
return;
btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA);
}
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
const struct btrfs_squota_delta *delta) const struct btrfs_squota_delta *delta)
{ {


@ -127,7 +127,12 @@ struct btrfs_inode;
* Record a dirty extent, and info qgroup to update quota on it * Record a dirty extent, and info qgroup to update quota on it
*/ */
struct btrfs_qgroup_extent_record { struct btrfs_qgroup_extent_record {
u64 bytenr; /*
* The bytenr of the extent is given by its index in the dirty_extents
* xarray of struct btrfs_delayed_ref_root left shifted by
* fs_info->sectorsize_bits.
*/
u64 num_bytes; u64 num_bytes;
/* /*
@ -345,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_trace_extent_nolock( int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info, struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_qgroup_extent_record *record); struct btrfs_qgroup_extent_record *record,
u64 bytenr);
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord); struct btrfs_qgroup_extent_record *qrecord,
u64 bytenr);
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes); u64 num_bytes);
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@ -432,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks(
struct btrfs_qgroup_swapped_blocks *swapped_blocks); struct btrfs_qgroup_swapped_blocks *swapped_blocks);
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root); void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_root *subvol_root,
struct btrfs_block_group *bg, struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot, struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot, struct extent_buffer *reloc_parent, int reloc_slot,
@ -442,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb); struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info); bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info);
void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
const struct btrfs_squota_delta *delta); const struct btrfs_squota_delta *delta);
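
Since btrfs_qgroup_extent_record no longer stores its bytenr, the mapping between a record's slot in the dirty_extents xarray and the extent start is worth spelling out. A small illustrative pair of helpers follows; the names are hypothetical, the kernel simply open-codes these shifts as seen in the hunks above.

/* Sketch: dirty_extents xarray index <-> extent bytenr (hypothetical helpers). */
static inline unsigned long qgroup_record_index(const struct btrfs_fs_info *fs_info,
						u64 bytenr)
{
	return (unsigned long)(bytenr >> fs_info->sectorsize_bits);
}

static inline u64 qgroup_record_bytenr(const struct btrfs_fs_info *fs_info,
				       unsigned long index)
{
	return ((u64)index) << fs_info->sectorsize_bits;
}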


@ -13,6 +13,39 @@
#include "volumes.h" #include "volumes.h"
#include "print-tree.h" #include "print-tree.h"
static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
const struct btrfs_key *oldkey,
u64 newlen, u64 frontpad)
{
struct btrfs_stripe_extent *extent;
struct extent_buffer *leaf;
int slot;
size_t item_size;
struct btrfs_key newkey = {
.objectid = oldkey->objectid + frontpad,
.type = BTRFS_RAID_STRIPE_KEY,
.offset = newlen,
};
ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
leaf = path->nodes[0];
slot = path->slots[0];
item_size = btrfs_item_size(leaf, slot);
extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
struct btrfs_raid_stride *stride = &extent->strides[i];
u64 phys;
phys = btrfs_raid_stride_physical(leaf, stride);
btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad);
}
btrfs_set_item_key_safe(trans, path, &newkey);
}
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
{ {
struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_fs_info *fs_info = trans->fs_info;
@ -36,23 +69,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
while (1) { while (1) {
key.objectid = start; key.objectid = start;
key.type = BTRFS_RAID_STRIPE_KEY; key.type = BTRFS_RAID_STRIPE_KEY;
key.offset = length; key.offset = 0;
ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
if (ret < 0) if (ret < 0)
break; break;
if (ret > 0) {
ret = 0; if (path->slots[0] == btrfs_header_nritems(path->nodes[0]))
if (path->slots[0] == 0)
break;
path->slots[0]--; path->slots[0]--;
}
leaf = path->nodes[0]; leaf = path->nodes[0];
slot = path->slots[0]; slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_item_key_to_cpu(leaf, &key, slot);
found_start = key.objectid; found_start = key.objectid;
found_end = found_start + key.offset; found_end = found_start + key.offset;
ret = 0;
if (key.type != BTRFS_RAID_STRIPE_KEY)
break;
/* That stripe ends before we start, we're done. */ /* That stripe ends before we start, we're done. */
if (found_end <= start) if (found_end <= start)
@ -61,7 +95,40 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
trace_btrfs_raid_extent_delete(fs_info, start, end, trace_btrfs_raid_extent_delete(fs_info, start, end,
found_start, found_end); found_start, found_end);
ASSERT(found_start >= start && found_end <= end); /*
* The stripe extent starts before the range we want to delete:
*
* |--- RAID Stripe Extent ---|
* |--- keep ---|--- drop ---|
*
* This means we have to duplicate the tree item, truncate the
* length to the new size and then re-insert the item.
*/
if (found_start < start) {
u64 diff = start - found_start;
btrfs_partially_delete_raid_extent(trans, path, &key,
diff, 0);
break;
}
/*
* The stripe extent ends after the range we want to delete:
*
* |--- RAID Stripe Extent ---|
* |--- drop ---|--- keep ---|
*
* This means we have to duplicate the tree item, truncate the
* length to the new size and then re-insert the item.
*/
if (found_end > end) {
u64 diff = found_end - end;
btrfs_partially_delete_raid_extent(trans, path, &key,
diff, diff);
break;
}
ret = btrfs_del_item(trans, stripe_root, path); ret = btrfs_del_item(trans, stripe_root, path);
if (ret) if (ret)
break; break;
@ -108,7 +175,8 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
return ret; return ret;
} }
static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, EXPORT_FOR_TESTS
int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_io_context *bioc) struct btrfs_io_context *bioc)
{ {
struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_fs_info *fs_info = trans->fs_info;
@ -233,7 +301,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
found_end = found_logical + found_length; found_end = found_logical + found_length;
if (found_logical > end) { if (found_logical > end) {
ret = -ENOENT; ret = -ENODATA;
goto out; goto out;
} }
@ -279,10 +347,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
} }
/* If we're here, we haven't found the requested devid in the stripe. */ /* If we're here, we haven't found the requested devid in the stripe. */
ret = -ENOENT; ret = -ENODATA;
out: out:
if (ret > 0) if (ret > 0)
ret = -ENOENT; ret = -ENODATA;
if (ret && ret != -EIO && !stripe->rst_search_commit_root) { if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
btrfs_debug(fs_info, btrfs_debug(fs_info,
"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",


@@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
 int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_ordered_extent *ordered_extent);
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+				 struct btrfs_io_context *bioc);
+#endif
+
 static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
 						 u64 map_type)
 {


@@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list)
 static void assert_rbio(struct btrfs_raid_bio *rbio)
 {
-	if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
-	    !IS_ENABLED(CONFIG_BTRFS_ASSERT))
+	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
 		return;
 
 	/*


@@ -1244,7 +1244,7 @@ again:
 			 * The real subtree rescan is delayed until we have new
 			 * CoW on the subtree root node before transaction commit.
 			 */
-			ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+			ret = btrfs_qgroup_add_swapped_blocks(dest,
 					rc->block_group, parent, slot,
 					path->nodes[level], path->slots[level],
 					last_snapshot);


@ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe)
stripe->bg->start + stripe->bg->length - stripe->logical); stripe->bg->start + stripe->bg->length - stripe->logical);
} }
static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
struct scrub_stripe *stripe)
{ {
struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL; struct btrfs_bio *bbio = NULL;
@ -1704,8 +1703,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
&stripe_len, &bioc, &io_stripe, &mirror); &stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc); btrfs_put_bioc(bioc);
if (err < 0) { if (err < 0) {
if (err != -ENODATA) {
/*
* Earlier btrfs_get_raid_extent_offset()
* returned -ENODATA, which means there's
* no entry for the corresponding range
* in the stripe tree. But if it's in
* the extent tree, then it's a preallocated
* extent and not an error.
*/
set_bit(i, &stripe->io_error_bitmap); set_bit(i, &stripe->io_error_bitmap);
set_bit(i, &stripe->error_bitmap); set_bit(i, &stripe->error_bitmap);
}
continue; continue;
} }
@ -1743,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
scrub_submit_extent_sector_read(sctx, stripe); scrub_submit_extent_sector_read(stripe);
return; return;
} }
@ -1954,7 +1963,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ASSERT(sctx->raid56_data_stripes); ASSERT(sctx->raid56_data_stripes);
/* /*
* For data stripe search, we cannot re-use the same extent/csum paths, * For data stripe search, we cannot reuse the same extent/csum paths,
* as the data stripe bytenr may be smaller than previous extent. Thus * as the data stripe bytenr may be smaller than previous extent. Thus
* we have to use our own extent/csum paths. * we have to use our own extent/csum paths.
*/ */
@ -2103,7 +2112,6 @@ out:
*/ */
static int scrub_simple_mirror(struct scrub_ctx *sctx, static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg, struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length, u64 logical_start, u64 logical_length,
struct btrfs_device *device, struct btrfs_device *device,
u64 physical, int mirror_num) u64 physical, int mirror_num)
@ -2222,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
* just RAID1, so we can reuse scrub_simple_mirror() to scrub * just RAID1, so we can reuse scrub_simple_mirror() to scrub
* this stripe. * this stripe.
*/ */
ret = scrub_simple_mirror(sctx, bg, map, cur_logical, ret = scrub_simple_mirror(sctx, bg, cur_logical,
BTRFS_STRIPE_LEN, device, cur_physical, BTRFS_STRIPE_LEN, device, cur_physical,
mirror_num); mirror_num);
if (ret) if (ret)
@ -2256,7 +2264,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/* Offset inside the chunk */ /* Offset inside the chunk */
u64 offset; u64 offset;
u64 stripe_logical; u64 stripe_logical;
int stop_loop = 0;
/* Extent_path should be released by now. */ /* Extent_path should be released by now. */
ASSERT(sctx->extent_path.nodes[0] == NULL); ASSERT(sctx->extent_path.nodes[0] == NULL);
@ -2307,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* Only @physical and @mirror_num needs to calculated using * Only @physical and @mirror_num needs to calculated using
* @stripe_index. * @stripe_index.
*/ */
ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
scrub_dev, map->stripes[stripe_index].physical, scrub_dev, map->stripes[stripe_index].physical,
stripe_index + 1); stripe_index + 1);
offset = 0; offset = 0;
@ -2362,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* We can reuse scrub_simple_mirror() here, as the repair part * We can reuse scrub_simple_mirror() here, as the repair part
* is still based on @mirror_num. * is still based on @mirror_num.
*/ */
ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
scrub_dev, physical, 1); scrub_dev, physical, 1);
if (ret < 0) if (ret < 0)
goto out; goto out;
@ -2370,14 +2377,8 @@ next:
logical += increment; logical += increment;
physical += BTRFS_STRIPE_LEN; physical += BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock); spin_lock(&sctx->stat_lock);
if (stop_loop)
sctx->stat.last_physical =
map->stripes[stripe_index].physical + dev_stripe_len;
else
sctx->stat.last_physical = physical; sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock); spin_unlock(&sctx->stat_lock);
if (stop_loop)
break;
} }
out: out:
ret2 = flush_scrub_stripes(sctx); ret2 = flush_scrub_stripes(sctx);


@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
return ret; return ret;
} }
typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index, typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);
struct fs_path *p,
void *ctx);
/* /*
* Helper function to iterate the entries in ONE btrfs_inode_ref or * Helper function to iterate the entries in ONE btrfs_inode_ref or
@ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
u32 name_len; u32 name_len;
char *start; char *start;
int ret = 0; int ret = 0;
int num = 0;
int index;
u64 dir; u64 dir;
unsigned long name_off; unsigned long name_off;
unsigned long elem_size; unsigned long elem_size;
@ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
iref = (struct btrfs_inode_ref *)(ptr + cur); iref = (struct btrfs_inode_ref *)(ptr + cur);
name_len = btrfs_inode_ref_name_len(eb, iref); name_len = btrfs_inode_ref_name_len(eb, iref);
name_off = (unsigned long)(iref + 1); name_off = (unsigned long)(iref + 1);
index = btrfs_inode_ref_index(eb, iref);
dir = found_key->offset; dir = found_key->offset;
} else { } else {
extref = (struct btrfs_inode_extref *)(ptr + cur); extref = (struct btrfs_inode_extref *)(ptr + cur);
name_len = btrfs_inode_extref_name_len(eb, extref); name_len = btrfs_inode_extref_name_len(eb, extref);
name_off = (unsigned long)&extref->name; name_off = (unsigned long)&extref->name;
index = btrfs_inode_extref_index(eb, extref);
dir = btrfs_inode_extref_parent(eb, extref); dir = btrfs_inode_extref_parent(eb, extref);
} }
@ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
} }
cur += elem_size + name_len; cur += elem_size + name_len;
ret = iterate(num, dir, index, p, ctx); ret = iterate(dir, p, ctx);
if (ret) if (ret)
goto out; goto out;
num++;
} }
out: out:
@ -1227,8 +1220,7 @@ out:
return ret; return ret;
} }
static int __copy_first_ref(int num, u64 dir, int index, static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx)
struct fs_path *p, void *ctx)
{ {
int ret; int ret;
struct fs_path *pt = ctx; struct fs_path *pt = ctx;
@ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref, struct recorded_ref *parent_ref,
const bool is_orphan) const bool is_orphan)
{ {
struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
struct btrfs_path *path; struct btrfs_path *path;
struct btrfs_key key; struct btrfs_key key;
struct btrfs_key di_key; struct btrfs_key di_key;
@ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out; goto out;
} }
di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, di = btrfs_match_dir_item_name(path, parent_ref->name,
parent_ref->name_len); parent_ref->name_len);
if (!di) { if (!di) {
ret = 0; ret = 0;
@ -4708,8 +4699,7 @@ out:
return ret; return ret;
} }
static int record_new_ref_if_needed(int num, u64 dir, int index, static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
struct fs_path *name, void *ctx)
{ {
int ret = 0; int ret = 0;
struct send_ctx *sctx = ctx; struct send_ctx *sctx = ctx;
@ -4738,8 +4728,7 @@ out:
return ret; return ret;
} }
static int record_deleted_ref_if_needed(int num, u64 dir, int index, static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
struct fs_path *name, void *ctx)
{ {
int ret = 0; int ret = 0;
struct send_ctx *sctx = ctx; struct send_ctx *sctx = ctx;
@ -5677,10 +5666,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really * Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf. * reading into send_buf.
*/ */
ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
disk_bytenr, disk_num_bytes, disk_bytenr, disk_num_bytes,
sctx->send_buf_pages + sctx->send_buf_pages +
(data_offset >> PAGE_SHIFT)); (data_offset >> PAGE_SHIFT),
NULL);
if (ret) if (ret)
goto out; goto out;
@ -8135,7 +8125,20 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
* making it RW. This also protects against deletion. * making it RW. This also protects against deletion.
*/ */
spin_lock(&send_root->root_item_lock); spin_lock(&send_root->root_item_lock);
if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) { /*
* Unlikely but possible, if the subvolume is marked for deletion but
* is slow to remove the directory entry, send can still be started.
*/
if (btrfs_root_dead(send_root)) {
spin_unlock(&send_root->root_item_lock);
return -EPERM;
}
/* Userspace tools do the checks and warn the user if it's not RO. */
if (!btrfs_root_readonly(send_root)) {
spin_unlock(&send_root->root_item_lock);
return -EPERM;
}
if (send_root->dedupe_in_progress) {
dedupe_in_progress_warn(send_root); dedupe_in_progress_warn(send_root);
spin_unlock(&send_root->root_item_lock); spin_unlock(&send_root->root_item_lock);
return -EAGAIN; return -EAGAIN;
@ -8143,15 +8146,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
send_root->send_in_progress++; send_root->send_in_progress++;
spin_unlock(&send_root->root_item_lock); spin_unlock(&send_root->root_item_lock);
/*
* Userspace tools do the checks and warn the user if it's
* not RO.
*/
if (!btrfs_root_readonly(send_root)) {
ret = -EPERM;
goto out;
}
/* /*
* Check that we don't overflow at later allocations, we request * Check that we don't overflow at later allocations, we request
* clone_sources_count + 1 items, and compare to unsigned long inside * clone_sources_count + 1 items, and compare to unsigned long inside
@ -8217,15 +8211,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
} }
sctx->send_root = send_root; sctx->send_root = send_root;
/*
* Unlikely but possible, if the subvolume is marked for deletion but
* is slow to remove the directory entry, send can still be started
*/
if (btrfs_root_dead(sctx->send_root)) {
ret = -EPERM;
goto out;
}
sctx->clone_roots_cnt = arg->clone_sources_count; sctx->clone_roots_cnt = arg->clone_sources_count;
if (sctx->proto >= 2) { if (sctx->proto >= 2) {


@@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args;
 #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
 
 /* Conditional support for the upcoming protocol version. */
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
 #define BTRFS_SEND_STREAM_VERSION 3
 #else
 #define BTRFS_SEND_STREAM_VERSION 2


@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* If we are freeing inodes, we want to make sure all delayed iputs have * If we are freeing inodes, we want to make sure all delayed iputs have
* completed, because they could have been on an inode with i_nlink == 0, and * completed, because they could have been on an inode with i_nlink == 0, and
* thus have been truncated and freed up space. But again this space is not * thus have been truncated and freed up space. But again this space is not
* immediately re-usable, it comes in the form of a delayed ref, which must be * immediately reusable, it comes in the form of a delayed ref, which must be
* run and then the transaction must be committed. * run and then the transaction must be committed.
* *
* COMMIT_TRANS * COMMIT_TRANS
@ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock); spin_unlock(&space_info->lock);
} }
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, static void wait_reserve_ticket(struct btrfs_space_info *space_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket) struct reserve_ticket *ticket)
{ {
@ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL: case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket); wait_reserve_ticket(space_info, ticket);
break; break;
case BTRFS_RESERVE_FLUSH_LIMIT: case BTRFS_RESERVE_FLUSH_LIMIT:
priority_reclaim_metadata_space(fs_info, space_info, ticket, priority_reclaim_metadata_space(fs_info, space_info, ticket,
@ -1984,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
return unalloc < data_chunk_size; return unalloc < data_chunk_size;
} }
static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info, static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid)
struct btrfs_space_info *space_info, int raid)
{ {
struct btrfs_block_group *bg; struct btrfs_block_group *bg;
int thresh_pct; int thresh_pct;
@ -2081,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
if (!btrfs_should_periodic_reclaim(space_info)) if (!btrfs_should_periodic_reclaim(space_info))
continue; continue;
for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
do_reclaim_sweep(fs_info, space_info, raid); do_reclaim_sweep(space_info, raid);
} }
} }


@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
spin_lock_init(&ret->lock); spin_lock_init(&ret->lock);
if (type == BTRFS_SUBPAGE_METADATA) { if (type == BTRFS_SUBPAGE_METADATA)
atomic_set(&ret->eb_refs, 0); atomic_set(&ret->eb_refs, 0);
} else { else
atomic_set(&ret->readers, 0); atomic_set(&ret->nr_locked, 0);
atomic_set(&ret->writers, 0);
}
return ret; return ret;
} }
@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
__start_bit; \ __start_bit; \
}) })
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = len >> fs_info->sectorsize_bits;
unsigned long flags;
btrfs_subpage_assert(fs_info, folio, start, len);
spin_lock_irqsave(&subpage->lock, flags);
/*
* Even though it's just for reading the page, no one should have
* locked the subpage range.
*/
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
bitmap_set(subpage->bitmaps, start_bit, nbits);
atomic_add(nbits, &subpage->readers);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = len >> fs_info->sectorsize_bits;
unsigned long flags;
bool is_data;
bool last;
btrfs_subpage_assert(fs_info, folio, start, len);
is_data = is_data_inode(BTRFS_I(folio->mapping->host));
spin_lock_irqsave(&subpage->lock, flags);
/* The range should have already been locked. */
ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
ASSERT(atomic_read(&subpage->readers) >= nbits);
bitmap_clear(subpage->bitmaps, start_bit, nbits);
last = atomic_sub_and_test(nbits, &subpage->readers);
/*
* For data we need to unlock the page if the last read has finished.
*
* And please don't replace @last with atomic_sub_and_test() call
* inside if () condition.
* As we want the atomic_sub_and_test() to be always executed.
*/
if (is_data && last)
folio_unlock(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{ {
u64 orig_start = *start; u64 orig_start = *start;
@ -295,27 +237,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
orig_start + orig_len) - *start; orig_start + orig_len) - *start;
} }
static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
unsigned long flags;
int ret;
btrfs_subpage_assert(fs_info, folio, start, len);
spin_lock_irqsave(&subpage->lock, flags);
ASSERT(atomic_read(&subpage->readers) == 0);
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
bitmap_set(subpage->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &subpage->writers);
ASSERT(ret == nbits);
spin_unlock_irqrestore(&subpage->lock, flags);
}
static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len) struct folio *folio, u64 start, u32 len)
{ {
struct btrfs_subpage *subpage = folio_get_private(folio); struct btrfs_subpage *subpage = folio_get_private(folio);
@ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
* extent_clear_unlock_delalloc() for compression path. * extent_clear_unlock_delalloc() for compression path.
* *
* This @locked_page is locked by plain lock_page(), thus its * This @locked_page is locked by plain lock_page(), thus its
* subpage::writers is 0. Handle them in a special way. * subpage::locked is 0. Handle them in a special way.
*/ */
if (atomic_read(&subpage->writers) == 0) { if (atomic_read(&subpage->nr_locked) == 0) {
spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock_irqrestore(&subpage->lock, flags);
return true; return true;
} }
@ -345,39 +267,12 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
clear_bit(bit, subpage->bitmaps); clear_bit(bit, subpage->bitmaps);
cleared++; cleared++;
} }
ASSERT(atomic_read(&subpage->writers) >= cleared); ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
last = atomic_sub_and_test(cleared, &subpage->writers); last = atomic_sub_and_test(cleared, &subpage->nr_locked);
spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock_irqrestore(&subpage->lock, flags);
return last; return last;
} }
/*
* Lock a folio for delalloc page writeback.
*
* Return -EAGAIN if the page is not properly initialized.
* Return 0 with the page locked, and writer counter updated.
*
* Even with 0 returned, the page still need extra check to make sure
* it's really the correct page, as the caller is using
* filemap_get_folios_contig(), which can race with page invalidating.
*/
int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
folio_lock(folio);
return 0;
}
folio_lock(folio);
if (!folio_test_private(folio) || !folio_get_private(folio)) {
folio_unlock(folio);
return -EAGAIN;
}
btrfs_subpage_clamp_range(folio, &start, &len);
btrfs_subpage_start_writer(fs_info, folio, start, len);
return 0;
}
/* /*
* Handle different locked folios: * Handle different locked folios:
* *
@ -394,7 +289,7 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
* bitmap, reduce the writer lock number, and unlock the page if that's * bitmap, reduce the writer lock number, and unlock the page if that's
* the last locked range. * the last locked range.
*/ */
void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len) struct folio *folio, u64 start, u32 len)
{ {
struct btrfs_subpage *subpage = folio_get_private(folio); struct btrfs_subpage *subpage = folio_get_private(folio);
@ -408,23 +303,23 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
/* /*
* For subpage case, there are two types of locked page. With or * For subpage case, there are two types of locked page. With or
* without writers number. * without locked number.
* *
* Since we own the page lock, no one else could touch subpage::writers * Since we own the page lock, no one else could touch subpage::locked
* and we are safe to do several atomic operations without spinlock. * and we are safe to do several atomic operations without spinlock.
*/ */
if (atomic_read(&subpage->writers) == 0) { if (atomic_read(&subpage->nr_locked) == 0) {
/* No writers, locked by plain lock_page(). */ /* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio); folio_unlock(folio);
return; return;
} }
btrfs_subpage_clamp_range(folio, &start, &len); btrfs_subpage_clamp_range(folio, &start, &len);
if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len))
folio_unlock(folio); folio_unlock(folio);
} }
void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, unsigned long bitmap) struct folio *folio, unsigned long bitmap)
{ {
struct btrfs_subpage *subpage = folio_get_private(folio); struct btrfs_subpage *subpage = folio_get_private(folio);
@ -434,13 +329,13 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
int cleared = 0; int cleared = 0;
int bit; int bit;
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { if (!btrfs_is_subpage(fs_info, folio->mapping)) {
folio_unlock(folio); folio_unlock(folio);
return; return;
} }
if (atomic_read(&subpage->writers) == 0) { if (atomic_read(&subpage->nr_locked) == 0) {
/* No writers, locked by plain lock_page(). */ /* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio); folio_unlock(folio);
return; return;
} }
@ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
cleared++; cleared++;
} }
ASSERT(atomic_read(&subpage->writers) >= cleared); ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
last = atomic_sub_and_test(cleared, &subpage->writers); last = atomic_sub_and_test(cleared, &subpage->nr_locked);
spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock_irqrestore(&subpage->lock, flags);
if (last) if (last)
folio_unlock(folio); folio_unlock(folio);
@ -776,7 +671,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* This populates the involved subpage ranges so that subpage helpers can * This populates the involved subpage ranges so that subpage helpers can
* properly unlock them. * properly unlock them.
*/ */
void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info, void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len) struct folio *folio, u64 start, u32 len)
{ {
struct btrfs_subpage *subpage; struct btrfs_subpage *subpage;
@ -796,58 +691,11 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
/* Target range should not yet be locked. */ /* Target range should not yet be locked. */
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits)); ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
bitmap_set(subpage->bitmaps, start_bit, nbits); bitmap_set(subpage->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &subpage->writers); ret = atomic_add_return(nbits, &subpage->nr_locked);
ASSERT(ret <= fs_info->sectors_per_page); ASSERT(ret <= fs_info->sectors_per_page);
spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock_irqrestore(&subpage->lock, flags);
} }
/*
* Find any subpage writer locked range inside @folio, starting at file offset
* @search_start. The caller should ensure the folio is locked.
*
* Return true and update @found_start_ret and @found_len_ret to the first
* writer locked range.
* Return false if there is no writer locked range.
*/
bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 search_start,
u64 *found_start_ret, u32 *found_len_ret)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const u32 sectors_per_page = fs_info->sectors_per_page;
const unsigned int len = PAGE_SIZE - offset_in_page(search_start);
const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
locked, search_start, len);
const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked;
const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page;
unsigned long flags;
int first_zero;
int first_set;
bool found = false;
ASSERT(folio_test_locked(folio));
spin_lock_irqsave(&subpage->lock, flags);
first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit);
if (first_set >= locked_bitmap_end)
goto out;
found = true;
*found_start_ret = folio_pos(folio) +
((first_set - locked_bitmap_start) << fs_info->sectorsize_bits);
/*
* Since @first_set is ensured to be smaller than locked_bitmap_end
* here, @found_start_ret should be inside the folio.
*/
ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE);
first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set);
*found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits;
out:
spin_unlock_irqrestore(&subpage->lock, flags);
return found;
}
#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ #define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
{ \ { \
const int sectors_per_page = fs_info->sectors_per_page; \ const int sectors_per_page = fs_info->sectors_per_page; \


@ -45,14 +45,6 @@ enum {
struct btrfs_subpage { struct btrfs_subpage {
/* Common members for both data and metadata pages */ /* Common members for both data and metadata pages */
spinlock_t lock; spinlock_t lock;
/*
* Both data and metadata needs to track how many readers are for the
* page.
* Data relies on @readers to unlock the page when last reader finished.
* While metadata doesn't need page unlock, it needs to prevent
* page::private get cleared before the last end_page_read().
*/
atomic_t readers;
union { union {
/* /*
* Structures only used by metadata * Structures only used by metadata
@ -62,8 +54,12 @@ struct btrfs_subpage {
*/ */
atomic_t eb_refs; atomic_t eb_refs;
/* Structures only used by data */ /*
atomic_t writers; * Structures only used by data,
*
* How many sectors inside the page is locked.
*/
atomic_t nr_locked;
}; };
unsigned long bitmaps[]; unsigned long bitmaps[];
}; };
@ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len); struct folio *folio, u64 start, u32 len);
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len); struct folio *folio, u64 start, u32 len);
void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, unsigned long bitmap); struct folio *folio, unsigned long bitmap);
bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 search_start,
u64 *found_start_ret, u32 *found_len_ret);
/* /*
* Template for subpage related operations. * Template for subpage related operations.
* *


@ -28,7 +28,6 @@
#include <linux/btrfs.h> #include <linux/btrfs.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/fs_parser.h> #include <linux/fs_parser.h>
#include <linux/swap.h>
#include "messages.h" #include "messages.h"
#include "delayed-inode.h" #include "delayed-inode.h"
#include "ctree.h" #include "ctree.h"
@ -946,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
} }
static int btrfs_fill_super(struct super_block *sb, static int btrfs_fill_super(struct super_block *sb,
struct btrfs_fs_devices *fs_devices, struct btrfs_fs_devices *fs_devices)
void *data)
{ {
struct inode *inode; struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@ -971,7 +969,7 @@ static int btrfs_fill_super(struct super_block *sb,
return err; return err;
} }
err = open_ctree(sb, fs_devices, (char *)data); err = open_ctree(sb, fs_devices);
if (err) { if (err) {
btrfs_err(fs_info, "open_ctree failed"); btrfs_err(fs_info, "open_ctree failed");
return err; return err;
@ -1893,7 +1891,7 @@ static int btrfs_get_tree_super(struct fs_context *fc)
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
ret = btrfs_fill_super(sb, fs_devices, NULL); ret = btrfs_fill_super(sb, fs_devices);
} }
if (ret) { if (ret) {
@ -2257,7 +2255,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
if (IS_ERR_OR_NULL(device)) { if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex); mutex_unlock(&uuid_mutex);
if (IS_ERR(device))
ret = PTR_ERR(device); ret = PTR_ERR(device);
else
ret = 0;
break; break;
} }
ret = !(device->fs_devices->num_devices == ret = !(device->fs_devices->num_devices ==
@ -2396,13 +2397,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro
trace_btrfs_extent_map_shrinker_count(fs_info, nr); trace_btrfs_extent_map_shrinker_count(fs_info, nr);
/*
* Only report the real number for DEBUG builds, as there are reports of
* serious performance degradation caused by too frequent shrinks.
*/
if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
return nr; return nr;
return 0;
} }
static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
@ -2410,16 +2405,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_fs_info *fs_info = btrfs_sb(sb);
/* btrfs_free_extent_maps(fs_info, nr_to_scan);
* We may be called from any task trying to allocate memory and we don't
* want to slow it down with scanning and dropping extent maps. It would
* also cause heavy lock contention if many tasks concurrently enter
* here. Therefore only allow kswapd tasks to scan and drop extent maps.
*/
if (!current_is_kswapd())
return 0;
return btrfs_free_extent_maps(fs_info, nr_to_scan); /* The extent map shrinker runs asynchronously, so always return 0. */
return 0;
} }
static const struct super_operations btrfs_super_ops = { static const struct super_operations btrfs_super_ops = {


@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store); btrfs_bg_reclaim_threshold_store);
#ifdef CONFIG_BTRFS_DEBUG #ifdef CONFIG_BTRFS_EXPERIMENTAL
static ssize_t btrfs_offload_csum_show(struct kobject *kobj, static ssize_t btrfs_offload_csum_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf) struct kobj_attribute *a, char *buf)
{ {
@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid), BTRFS_ATTR_PTR(, temp_fsid),
#ifdef CONFIG_BTRFS_DEBUG #ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_ATTR_PTR(, offload_csum), BTRFS_ATTR_PTR(, offload_csum),
#endif #endif
NULL, NULL,


@ -29,6 +29,7 @@ const char *test_error[] = {
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
[TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
[TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context",
}; };
static const struct super_operations btrfs_test_super_ops = { static const struct super_operations btrfs_test_super_ops = {
@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void)
ret = btrfs_test_free_space_tree(sectorsize, nodesize); ret = btrfs_test_free_space_tree(sectorsize, nodesize);
if (ret) if (ret)
goto out; goto out;
ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);
if (ret)
goto out;
} }
} }
ret = btrfs_test_extent_map(); ret = btrfs_test_extent_map();


@ -24,6 +24,7 @@ enum {
TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP, TEST_ALLOC_EXTENT_MAP,
TEST_ALLOC_CHUNK_MAP, TEST_ALLOC_CHUNK_MAP,
TEST_ALLOC_IO_CONTEXT,
}; };
extern const char *test_error[]; extern const char *test_error[];
@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_map(void); int btrfs_test_extent_map(void);
struct inode *btrfs_new_test_inode(void); struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);


@ -0,0 +1,538 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2024 Western Digital Corporation or its affiliates.
*/
#include <linux/sizes.h>
#include "../fs.h"
#include "../disk-io.h"
#include "../transaction.h"
#include "../volumes.h"
#include "../raid-stripe-tree.h"
#include "btrfs-tests.h"
#define RST_TEST_NUM_DEVICES (2)
#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1)
typedef int (*test_func_t)(struct btrfs_trans_handle *trans);
static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices,
u64 devid)
{
struct btrfs_device *dev;
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
if (dev->devid == devid)
return dev;
}
return NULL;
}
/*
* Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
* delete the 1st 32K, making the new start address 1M+32K.
*/
static int test_front_delete(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_io_context *bioc;
struct btrfs_io_stripe io_stripe = { 0 };
u64 map_type = RST_TEST_RAID1_TYPE;
u64 logical = SZ_1M;
u64 len = SZ_64K;
int ret;
bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
if (!bioc) {
test_std_err(TEST_ALLOC_IO_CONTEXT);
ret = -ENOMEM;
goto out;
}
io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
bioc->map_type = map_type;
bioc->size = len;
for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
struct btrfs_io_stripe *stripe = &bioc->stripes[i];
stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
if (!stripe->dev) {
test_err("cannot find device with devid %d", i);
ret = -EINVAL;
goto out;
}
stripe->physical = logical + i * SZ_1G;
}
ret = btrfs_insert_one_raid_extent(trans, bioc);
if (ret) {
test_err("inserting RAID extent failed: %d", ret);
goto out;
}
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed", logical,
logical + len);
goto out;
}
if (io_stripe.physical != logical) {
test_err("invalid physical address, expected %llu got %llu",
logical, io_stripe.physical);
ret = -EINVAL;
goto out;
}
if (len != SZ_64K) {
test_err("invalid stripe length, expected %llu got %llu",
(u64)SZ_64K, len);
ret = -EINVAL;
goto out;
}
ret = btrfs_delete_raid_extent(trans, logical, SZ_32K);
if (ret) {
test_err("deleting RAID extent [%llu, %llu] failed", logical,
logical + SZ_32K);
goto out;
}
len = SZ_32K;
ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len,
map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed",
logical + SZ_32K, logical + SZ_32K + len);
goto out;
}
if (io_stripe.physical != logical + SZ_32K) {
test_err("invalid physical address, expected %llu, got %llu",
logical + SZ_32K, io_stripe.physical);
ret = -EINVAL;
goto out;
}
if (len != SZ_32K) {
test_err("invalid stripe length, expected %llu, got %llu",
(u64)SZ_32K, len);
ret = -EINVAL;
goto out;
}
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (!ret) {
ret = -EINVAL;
test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
logical, logical + SZ_32K);
goto out;
}
ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
out:
btrfs_put_bioc(bioc);
return ret;
}
/*
* Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
* truncate the stripe extent down to 32K.
*/
static int test_tail_delete(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_io_context *bioc;
struct btrfs_io_stripe io_stripe = { 0 };
u64 map_type = RST_TEST_RAID1_TYPE;
u64 logical = SZ_1M;
u64 len = SZ_64K;
int ret;
bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
if (!bioc) {
test_std_err(TEST_ALLOC_IO_CONTEXT);
ret = -ENOMEM;
goto out;
}
io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
bioc->map_type = map_type;
bioc->size = len;
for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
struct btrfs_io_stripe *stripe = &bioc->stripes[i];
stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
if (!stripe->dev) {
test_err("cannot find device with devid %d", i);
ret = -EINVAL;
goto out;
}
stripe->physical = logical + i * SZ_1G;
}
ret = btrfs_insert_one_raid_extent(trans, bioc);
if (ret) {
test_err("inserting RAID extent failed: %d", ret);
goto out;
}
io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
if (!io_stripe.dev) {
ret = -EINVAL;
goto out;
}
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed", logical,
logical + len);
goto out;
}
if (io_stripe.physical != logical) {
test_err("invalid physical address, expected %llu got %llu",
logical, io_stripe.physical);
ret = -EINVAL;
goto out;
}
if (len != SZ_64K) {
test_err("invalid stripe length, expected %llu got %llu",
(u64)SZ_64K, len);
ret = -EINVAL;
goto out;
}
ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
if (ret) {
test_err("deleting RAID extent [%llu, %llu] failed",
logical + SZ_32K, logical + SZ_64K);
goto out;
}
len = SZ_32K;
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed", logical,
logical + len);
goto out;
}
if (io_stripe.physical != logical) {
test_err("invalid physical address, expected %llu, got %llu",
logical, io_stripe.physical);
ret = -EINVAL;
goto out;
}
if (len != SZ_32K) {
test_err("invalid stripe length, expected %llu, got %llu",
(u64)SZ_32K, len);
ret = -EINVAL;
goto out;
}
ret = btrfs_delete_raid_extent(trans, logical, len);
if (ret)
test_err("deleting RAID extent [%llu, %llu] failed", logical,
logical + len);
out:
btrfs_put_bioc(bioc);
return ret;
}
/*
* Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
* overwrite the whole range giving it new physical address at an offset of 1G.
* The intent of this test is to exercise the 'update_raid_extent_item()'
* function called by btrfs_insert_one_raid_extent().
*/
static int test_create_update_delete(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_io_context *bioc;
struct btrfs_io_stripe io_stripe = { 0 };
u64 map_type = RST_TEST_RAID1_TYPE;
u64 logical = SZ_1M;
u64 len = SZ_64K;
int ret;
bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
if (!bioc) {
test_std_err(TEST_ALLOC_IO_CONTEXT);
ret = -ENOMEM;
goto out;
}
io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
bioc->map_type = map_type;
bioc->size = len;
for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
struct btrfs_io_stripe *stripe = &bioc->stripes[i];
stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
if (!stripe->dev) {
test_err("cannot find device with devid %d", i);
ret = -EINVAL;
goto out;
}
stripe->physical = logical + i * SZ_1G;
}
ret = btrfs_insert_one_raid_extent(trans, bioc);
if (ret) {
test_err("inserting RAID extent failed: %d", ret);
goto out;
}
io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
if (!io_stripe.dev) {
ret = -EINVAL;
goto out;
}
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed", logical,
logical + len);
goto out;
}
if (io_stripe.physical != logical) {
test_err("invalid physical address, expected %llu got %llu",
logical, io_stripe.physical);
ret = -EINVAL;
goto out;
}
if (len != SZ_64K) {
test_err("invalid stripe length, expected %llu got %llu",
(u64)SZ_64K, len);
ret = -EINVAL;
goto out;
}
for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
struct btrfs_io_stripe *stripe = &bioc->stripes[i];
stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
if (!stripe->dev) {
test_err("cannot find device with devid %d", i);
ret = -EINVAL;
goto out;
}
stripe->physical = SZ_1G + logical + i * SZ_1G;
}
ret = btrfs_insert_one_raid_extent(trans, bioc);
if (ret) {
test_err("updating RAID extent failed: %d", ret);
goto out;
}
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed", logical,
logical + len);
goto out;
}
if (io_stripe.physical != logical + SZ_1G) {
test_err("invalid physical address, expected %llu, got %llu",
logical + SZ_1G, io_stripe.physical);
ret = -EINVAL;
goto out;
}
if (len != SZ_64K) {
test_err("invalid stripe length, expected %llu, got %llu",
(u64)SZ_64K, len);
ret = -EINVAL;
goto out;
}
ret = btrfs_delete_raid_extent(trans, logical, len);
if (ret)
test_err("deleting RAID extent [%llu, %llu] failed", logical,
logical + len);
out:
btrfs_put_bioc(bioc);
return ret;
}
/*
* Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M.
* The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M.
*/
static int test_simple_create_delete(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_io_context *bioc;
struct btrfs_io_stripe io_stripe = { 0 };
u64 map_type = RST_TEST_RAID1_TYPE;
u64 logical = SZ_1M;
u64 len = SZ_64K;
int ret;
bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
if (!bioc) {
test_std_err(TEST_ALLOC_IO_CONTEXT);
ret = -ENOMEM;
goto out;
}
bioc->map_type = map_type;
bioc->size = SZ_64K;
for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
struct btrfs_io_stripe *stripe = &bioc->stripes[i];
stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
if (!stripe->dev) {
test_err("cannot find device with devid %d", i);
ret = -EINVAL;
goto out;
}
stripe->physical = logical + i * SZ_1G;
}
ret = btrfs_insert_one_raid_extent(trans, bioc);
if (ret) {
test_err("inserting RAID extent failed: %d", ret);
goto out;
}
io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
if (!io_stripe.dev) {
ret = -EINVAL;
goto out;
}
ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
if (ret) {
test_err("lookup of RAID extent [%llu, %llu] failed", logical,
logical + len);
goto out;
}
if (io_stripe.physical != logical) {
test_err("invalid physical address, expected %llu got %llu",
logical, io_stripe.physical);
ret = -EINVAL;
goto out;
}
if (len != SZ_64K) {
test_err("invalid stripe length, expected %llu got %llu",
(u64)SZ_64K, len);
ret = -EINVAL;
goto out;
}
ret = btrfs_delete_raid_extent(trans, logical, len);
if (ret)
test_err("deleting RAID extent [%llu, %llu] failed", logical,
logical + len);
out:
btrfs_put_bioc(bioc);
return ret;
}
static const test_func_t tests[] = {
test_simple_create_delete,
test_create_update_delete,
test_tail_delete,
test_front_delete,
};
static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)
{
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root = NULL;
int ret;
fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize);
if (!fs_info) {
test_std_err(TEST_ALLOC_FS_INFO);
ret = -ENOMEM;
goto out;
}
root = btrfs_alloc_dummy_root(fs_info);
if (IS_ERR(root)) {
test_std_err(TEST_ALLOC_ROOT);
ret = PTR_ERR(root);
goto out;
}
btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);
root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
fs_info->stripe_root = root;
root->fs_info->tree_root = root;
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
if (IS_ERR(root->node)) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
btrfs_set_header_nritems(root->node, 0);
root->alloc_bytenr += 2 * nodesize;
for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
struct btrfs_device *dev;
dev = btrfs_alloc_dummy_device(fs_info);
if (IS_ERR(dev)) {
test_err("cannot allocate device");
ret = PTR_ERR(dev);
goto out;
}
dev->devid = i;
}
btrfs_init_dummy_trans(&trans, root->fs_info);
ret = test(&trans);
if (ret)
goto out;
out:
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
return ret;
}
int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize)
{
int ret = 0;
test_msg("running raid-stripe-tree tests");
for (int i = 0; i < ARRAY_SIZE(tests); i++) {
ret = run_test(tests[i], sectorsize, nodesize);
if (ret) {
test_err("test-case %ps failed with %d\n", tests[i], ret);
goto out;
}
}
out:
return ret;
}


@ -141,8 +141,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
WARN_ON(refcount_read(&transaction->use_count) == 0); WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) { if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list)); BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT( WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs));
&transaction->delayed_refs.href_root.rb_root));
WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents));
if (transaction->delayed_refs.pending_csums) if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info, btrfs_err(transaction->fs_info,
@ -349,9 +348,8 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; xa_init(&cur_trans->delayed_refs.head_refs);
xa_init(&cur_trans->delayed_refs.dirty_extents); xa_init(&cur_trans->delayed_refs.dirty_extents);
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
/* /*
* although the tree mod log is per file system and not per transaction, * although the tree mod log is per file system and not per transaction,
@ -2052,7 +2050,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
spin_unlock(&fs_info->trans_lock); spin_unlock(&fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, fs_info); btrfs_cleanup_one_transaction(trans->transaction);
spin_lock(&fs_info->trans_lock); spin_lock(&fs_info->trans_lock);
if (cur_trans == fs_info->running_transaction) if (cur_trans == fs_info->running_transaction)


@ -33,7 +33,7 @@ struct btrfs_path;
*/ */
#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1) #define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
/* Radix-tree tag for roots that are part of the trasaction. */ /* Radix-tree tag for roots that are part of the transaction. */
#define BTRFS_ROOT_TRANS_TAG 0 #define BTRFS_ROOT_TRANS_TAG 0
enum btrfs_trans_state { enum btrfs_trans_state {


@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
return 0; return 0;
} }
int btrfs_verify_level_key(struct extent_buffer *eb, int level, int btrfs_verify_level_key(struct extent_buffer *eb,
struct btrfs_key *first_key, u64 parent_transid) const struct btrfs_tree_parent_check *check)
{ {
struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_info *fs_info = eb->fs_info;
int found_level; int found_level;
@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
int ret; int ret;
found_level = btrfs_header_level(eb); found_level = btrfs_header_level(eb);
if (found_level != level) { if (found_level != check->level) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree level check failed\n"); KERN_ERR "BTRFS: tree level check failed\n");
btrfs_err(fs_info, btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", "tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
eb->start, level, found_level); eb->start, check->level, found_level);
return -EIO; return -EIO;
} }
if (!first_key) if (!check->has_first_key)
return 0; return 0;
/* /*
@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
btrfs_node_key_to_cpu(eb, &found_key, 0); btrfs_node_key_to_cpu(eb, &found_key, 0);
else else
btrfs_item_key_to_cpu(eb, &found_key, 0); btrfs_item_key_to_cpu(eb, &found_key, 0);
ret = btrfs_comp_cpu_keys(first_key, &found_key); ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
if (ret) { if (ret) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree first key check failed\n"); KERN_ERR "BTRFS: tree first key check failed\n");
btrfs_err(fs_info, btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
eb->start, parent_transid, first_key->objectid, eb->start, check->transid, check->first_key.objectid,
first_key->type, first_key->offset, check->first_key.type, check->first_key.offset,
found_key.objectid, found_key.type, found_key.objectid, found_key.type,
found_key.offset); found_key.offset);
} }


@ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf, int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical); struct btrfs_chunk *chunk, u64 logical);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
int btrfs_verify_level_key(struct extent_buffer *eb, int level, int btrfs_verify_level_key(struct extent_buffer *eb,
struct btrfs_key *first_key, u64 parent_transid); const struct btrfs_tree_parent_check *check);
#endif #endif


@ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans, static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *inode,
struct btrfs_path *path, struct btrfs_path *path,
struct btrfs_log_ctx *ctx,
const struct list_head *delayed_del_list, const struct list_head *delayed_del_list,
const struct btrfs_delayed_item *first, const struct btrfs_delayed_item *first,
const struct btrfs_delayed_item **last_ret) const struct btrfs_delayed_item **last_ret)
@ -6265,7 +6264,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
if (ret < 0) { if (ret < 0) {
return ret; return ret;
} else if (ret == 0) { } else if (ret == 0) {
ret = batch_delete_dir_index_items(trans, inode, path, ctx, ret = batch_delete_dir_index_items(trans, inode, path,
delayed_del_list, curr, delayed_del_list, curr,
&last); &last);
if (ret) if (ret)


@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
* is freed (its refcount is decremented). * is freed (its refcount is decremented).
*/ */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct extent_buffer *eb, struct extent_buffer *eb,
u64 time_seq) u64 time_seq)
{ {


@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op); enum btrfs_mod_log_op op);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct extent_buffer *eb, struct extent_buffer *eb,
u64 time_seq); u64 time_seq);
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);


@ -732,6 +732,114 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
} }
/*
* We can have very weird soft links passed in.
* One example is "/proc/self/fd/<fd>", which can be a soft link to
* a block device.
*
* But it's never a good idea to use those weird names.
* Here we check if the path (not following symlinks) is a good one inside
* "/dev/".
*/
static bool is_good_dev_path(const char *dev_path)
{
struct path path = { .mnt = NULL, .dentry = NULL };
char *path_buf = NULL;
char *resolved_path;
bool is_good = false;
int ret;
if (!dev_path)
goto out;
path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!path_buf)
goto out;
/*
* Do not follow soft link, just check if the original path is inside
* "/dev/".
*/
ret = kern_path(dev_path, 0, &path);
if (ret)
goto out;
resolved_path = d_path(&path, path_buf, PATH_MAX);
if (IS_ERR(resolved_path))
goto out;
if (strncmp(resolved_path, "/dev/", strlen("/dev/")))
goto out;
is_good = true;
out:
kfree(path_buf);
path_put(&path);
return is_good;
}
static int get_canonical_dev_path(const char *dev_path, char *canonical)
{
struct path path = { .mnt = NULL, .dentry = NULL };
char *path_buf = NULL;
char *resolved_path;
int ret;
if (!dev_path) {
ret = -EINVAL;
goto out;
}
path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!path_buf) {
ret = -ENOMEM;
goto out;
}
ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
if (ret)
goto out;
resolved_path = d_path(&path, path_buf, PATH_MAX);
ret = strscpy(canonical, resolved_path, PATH_MAX);
out:
kfree(path_buf);
path_put(&path);
return ret;
}
static bool is_same_device(struct btrfs_device *device, const char *new_path)
{
struct path old = { .mnt = NULL, .dentry = NULL };
struct path new = { .mnt = NULL, .dentry = NULL };
char *old_path = NULL;
bool is_same = false;
int ret;
if (!device->name)
goto out;
old_path = kzalloc(PATH_MAX, GFP_NOFS);
if (!old_path)
goto out;
rcu_read_lock();
ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
rcu_read_unlock();
if (ret < 0)
goto out;
ret = kern_path(old_path, LOOKUP_FOLLOW, &old);
if (ret)
goto out;
ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
if (ret)
goto out;
if (path_equal(&old, &new))
is_same = true;
out:
kfree(old_path);
path_put(&old);
path_put(&new);
return is_same;
}
/* /*
* Add new device to list of registered devices * Add new device to list of registered devices
* *
@ -852,7 +960,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
MAJOR(path_devt), MINOR(path_devt), MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current)); current->comm, task_pid_nr(current));
} else if (!device->name || strcmp(device->name->str, path)) { } else if (!device->name || !is_same_device(device, path)) {
/* /*
* When FS is already mounted. * When FS is already mounted.
* 1. If you are here and if the device->name is NULL that * 1. If you are here and if the device->name is NULL that
@ -1383,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
bool new_device_added = false; bool new_device_added = false;
struct btrfs_device *device = NULL; struct btrfs_device *device = NULL;
struct file *bdev_file; struct file *bdev_file;
char *canonical_path = NULL;
u64 bytenr; u64 bytenr;
dev_t devt; dev_t devt;
int ret; int ret;
lockdep_assert_held(&uuid_mutex); lockdep_assert_held(&uuid_mutex);
if (!is_good_dev_path(path)) {
canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
if (canonical_path) {
ret = get_canonical_dev_path(path, canonical_path);
if (ret < 0) {
kfree(canonical_path);
canonical_path = NULL;
}
}
}
/* /*
* Avoid an exclusive open here, as the systemd-udev may initiate the * Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command, * device scan which may race with the user's mount or mkfs command,
@ -1433,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto free_disk_super; goto free_disk_super;
} }
device = device_list_add(path, disk_super, &new_device_added); device = device_list_add(canonical_path ? : path, disk_super,
&new_device_added);
if (!IS_ERR(device) && new_device_added) if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device); btrfs_free_stale_devices(device->devt, device);
@ -1442,6 +1562,7 @@ free_disk_super:
error_bdev_put: error_bdev_put:
fput(bdev_file); fput(bdev_file);
kfree(canonical_path);
return device; return device;
} }
@ -2721,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE); set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) { if (seeding_dev) {
btrfs_clear_sb_rdonly(sb);
/* GFP_KERNEL allocation must not be under device_list_mutex */ /* GFP_KERNEL allocation must not be under device_list_mutex */
seed_devices = btrfs_init_sprout(fs_info); seed_devices = btrfs_init_sprout(fs_info);
if (IS_ERR(seed_devices)) { if (IS_ERR(seed_devices)) {
@ -2865,8 +2984,6 @@ error_sysfs:
mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans: error_trans:
if (seeding_dev)
btrfs_set_sb_rdonly(sb);
if (trans) if (trans)
btrfs_end_transaction(trans); btrfs_end_transaction(trans);
error_free_zone: error_free_zone:
@ -5310,7 +5427,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
/* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
ctl->stripe_size) + ctl->nparity, ctl->stripe_size) + ctl->nparity,
@ -5842,24 +5959,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len; return len;
} }
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct btrfs_chunk_map *map;
int ret = 0;
if (!btrfs_fs_incompat(fs_info, RAID56))
return 0;
map = btrfs_get_chunk_map(fs_info, logical, len);
if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
btrfs_free_chunk_map(map);
}
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info, static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first, struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing) int dev_replace_is_ongoing)
@ -5920,9 +6019,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return preferred_mirror; return preferred_mirror;
} }
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, EXPORT_FOR_TESTS
u64 logical, struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
u16 total_stripes) u64 logical, u16 total_stripes)
{ {
struct btrfs_io_context *bioc; struct btrfs_io_context *bioc;
@ -6481,13 +6580,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
max_len = btrfs_max_io_len(map, map_offset, &io_geom); max_len = btrfs_max_io_len(map, map_offset, &io_geom);
*length = min_t(u64, map->chunk_len - map_offset, max_len); *length = min_t(u64, map->chunk_len - map_offset, max_len);
if (dev_replace->replace_task != current)
down_read(&dev_replace->rwsem); down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
/* /*
* Hold the semaphore for read during the whole operation, write is * Hold the semaphore for read during the whole operation, write is
* requested at commit time but must wait. * requested at commit time but must wait.
*/ */
if (!dev_replace_is_ongoing) if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
up_read(&dev_replace->rwsem); up_read(&dev_replace->rwsem);
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
@ -6627,7 +6728,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
bioc->mirror_num = io_geom.mirror_num; bioc->mirror_num = io_geom.mirror_num;
out: out:
if (dev_replace_is_ongoing) { if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
lockdep_assert_held(&dev_replace->rwsem); lockdep_assert_held(&dev_replace->rwsem);
/* Unlock and let waiting writers proceed */ /* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem); up_read(&dev_replace->rwsem);


@ -306,7 +306,7 @@ enum btrfs_read_policy {
BTRFS_NR_READ_POLICY, BTRFS_NR_READ_POLICY,
}; };
#ifdef CONFIG_BTRFS_DEBUG #ifdef CONFIG_BTRFS_EXPERIMENTAL
/* /*
* Checksum mode - offload it to workqueues or do it synchronously in * Checksum mode - offload it to workqueues or do it synchronously in
* btrfs_submit_chunk(). * btrfs_submit_chunk().
@ -430,7 +430,7 @@ struct btrfs_fs_devices {
/* Policy used to read the mirrored stripes. */ /* Policy used to read the mirrored stripes. */
enum btrfs_read_policy read_policy; enum btrfs_read_policy read_policy;
#ifdef CONFIG_BTRFS_DEBUG #ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Checksum mode - offload it or do it synchronously. */ /* Checksum mode - offload it or do it synchronously. */
enum btrfs_offload_csum_mode offload_csum_mode; enum btrfs_offload_csum_mode offload_csum_mode;
#endif #endif
@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev); void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical); u64 logical);
u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
@ -840,4 +838,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
u64 logical, u16 total_stripes);
#endif
#endif #endif


@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
{ {
struct btrfs_dir_item *di = NULL; struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path; struct btrfs_path *path;
size_t name_len = strlen(name); size_t name_len = strlen(name);
int ret = 0; int ret = 0;
@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
*/ */
ret = 0; ret = 0;
btrfs_assert_tree_write_locked(path->nodes[0]); btrfs_assert_tree_write_locked(path->nodes[0]);
di = btrfs_match_dir_item_name(fs_info, path, name, name_len); di = btrfs_match_dir_item_name(path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) { if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC; ret = -ENOSPC;
goto out; goto out;
} }
} else if (ret == -EEXIST) { } else if (ret == -EEXIST) {
ret = 0; ret = 0;
di = btrfs_match_dir_item_name(fs_info, path, name, name_len); di = btrfs_match_dir_item_name(path, name, name_len);
ASSERT(di); /* logic error */ ASSERT(di); /* logic error */
} else if (ret) { } else if (ret) {
goto out; goto out;


@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
pg_off = offset_in_page(start); pg_off = offset_in_page(start);
cur_len = btrfs_calc_input_length(orig_end, start); cur_len = btrfs_calc_input_length(orig_end, start);
data_in = kmap_local_folio(in_folio, pg_off); data_in = kmap_local_folio(in_folio, pg_off);
start += PAGE_SIZE; start += cur_len;
workspace->strm.next_in = data_in; workspace->strm.next_in = data_in;
workspace->strm.avail_in = cur_len; workspace->strm.avail_in = cur_len;
} }


@ -1739,7 +1739,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
return false; return false;
/* /*
* Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
* extent layout the relocation code has. * extent layout the relocation code has.
* Furthermore we have set aside own block-group from which only the * Furthermore we have set aside own block-group from which only the
* relocation "process" can allocate and make sure only one process at a * relocation "process" can allocate and make sure only one process at a
@ -1973,7 +1973,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
if (block_group->meta_write_pointer > eb->start) if (block_group->meta_write_pointer > eb->start)
return -EBUSY; return -EBUSY;
/* If for_sync, this hole will be filled with trasnsaction commit. */ /* If for_sync, this hole will be filled with transaction commit. */
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
return -EAGAIN; return -EAGAIN;
return -EBUSY; return -EBUSY;


@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next; struct list_head *pos, *next;
ASSERT(timer == &wsm.timer);
spin_lock(&wsm.lock); spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) { if (list_empty(&wsm.lru_list)) {
@ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */ /* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) { if (workspace->in_buf.pos == workspace->in_buf.size) {
tot_in += PAGE_SIZE; tot_in += workspace->in_buf.size;
kunmap_local(workspace->in_buf.src); kunmap_local(workspace->in_buf.src);
workspace->in_buf.src = NULL; workspace->in_buf.src = NULL;
folio_put(in_folio); folio_put(in_folio);


@ -37,6 +37,7 @@ enum io_uring_cmd_flags {
/* set when uring wants to cancel a previously issued command */ /* set when uring wants to cancel a previously issued command */
IO_URING_F_CANCEL = (1 << 11), IO_URING_F_CANCEL = (1 << 11),
IO_URING_F_COMPAT = (1 << 12), IO_URING_F_COMPAT = (1 << 12),
IO_URING_F_TASK_DEAD = (1 << 13),
}; };
struct io_wq_work_node { struct io_wq_work_node {


@ -1706,9 +1706,10 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
DECLARE_EVENT_CLASS(btrfs_qgroup_extent, DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
TP_PROTO(const struct btrfs_fs_info *fs_info, TP_PROTO(const struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup_extent_record *rec), const struct btrfs_qgroup_extent_record *rec,
u64 bytenr),
TP_ARGS(fs_info, rec), TP_ARGS(fs_info, rec, bytenr),
TP_STRUCT__entry_btrfs( TP_STRUCT__entry_btrfs(
__field( u64, bytenr ) __field( u64, bytenr )
@ -1716,7 +1717,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
), ),
TP_fast_assign_btrfs(fs_info, TP_fast_assign_btrfs(fs_info,
__entry->bytenr = rec->bytenr; __entry->bytenr = bytenr;
__entry->num_bytes = rec->num_bytes; __entry->num_bytes = rec->num_bytes;
), ),
@ -1727,17 +1728,19 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
TP_PROTO(const struct btrfs_fs_info *fs_info, TP_PROTO(const struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup_extent_record *rec), const struct btrfs_qgroup_extent_record *rec,
u64 bytenr),
TP_ARGS(fs_info, rec) TP_ARGS(fs_info, rec, bytenr)
); );
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent, DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
TP_PROTO(const struct btrfs_fs_info *fs_info, TP_PROTO(const struct btrfs_fs_info *fs_info,
const struct btrfs_qgroup_extent_record *rec), const struct btrfs_qgroup_extent_record *rec,
u64 bytenr),
TP_ARGS(fs_info, rec) TP_ARGS(fs_info, rec, bytenr)
); );
TRACE_EVENT(qgroup_num_dirty_extents, TRACE_EVENT(qgroup_num_dirty_extents,
@ -2341,7 +2344,6 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking);
DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read);
DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write); DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write);
DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic);
DECLARE_EVENT_CLASS(btrfs__space_info_update, DECLARE_EVENT_CLASS(btrfs__space_info_update,
@ -2553,10 +2555,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count,
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr, TP_PROTO(const struct btrfs_fs_info *fs_info, long nr),
u64 last_root_id, u64 last_ino),
TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino), TP_ARGS(fs_info, nr),
TP_STRUCT__entry_btrfs( TP_STRUCT__entry_btrfs(
__field( long, nr_to_scan ) __field( long, nr_to_scan )
@ -2566,10 +2567,11 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
), ),
TP_fast_assign_btrfs(fs_info, TP_fast_assign_btrfs(fs_info,
__entry->nr_to_scan = nr_to_scan; __entry->nr_to_scan = \
atomic64_read(&fs_info->em_shrinker_nr_to_scan);
__entry->nr = nr; __entry->nr = nr;
__entry->last_root_id = last_root_id; __entry->last_root_id = fs_info->em_shrinker_last_root;
__entry->last_ino = last_ino; __entry->last_ino = fs_info->em_shrinker_last_ino;
), ),
TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
@ -2579,10 +2581,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr, TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
u64 last_root_id, u64 last_ino),
TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino), TP_ARGS(fs_info, nr_dropped, nr),
TP_STRUCT__entry_btrfs( TP_STRUCT__entry_btrfs(
__field( long, nr_dropped ) __field( long, nr_dropped )
@ -2594,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
TP_fast_assign_btrfs(fs_info, TP_fast_assign_btrfs(fs_info,
__entry->nr_dropped = nr_dropped; __entry->nr_dropped = nr_dropped;
__entry->nr = nr; __entry->nr = nr;
__entry->last_root_id = last_root_id; __entry->last_root_id = fs_info->em_shrinker_last_root;
__entry->last_ino = last_ino; __entry->last_ino = fs_info->em_shrinker_last_ino;
), ),
TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",


@ -1049,6 +1049,29 @@ struct btrfs_ioctl_encoded_io_args {
#define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0 #define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0
#define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1 #define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1
/*
* Wait for subvolume cleaning process. This queries the kernel queue and it
* can change between the calls.
*
* - FOR_ONE - specify the subvolid
* - FOR_QUEUED - wait for all currently queued
* - COUNT - count number of queued
* - PEEK_FIRST - read which is the first in the queue (to be cleaned or being
* cleaned already), or 0 if the queue is empty
* - PEEK_LAST - read the last subvolid in the queue, or 0 if the queue is empty
*/
struct btrfs_ioctl_subvol_wait {
__u64 subvolid;
__u32 mode;
__u32 count;
};
#define BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE (0)
#define BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED (1)
#define BTRFS_SUBVOL_SYNC_COUNT (2)
#define BTRFS_SUBVOL_SYNC_PEEK_FIRST (3)
#define BTRFS_SUBVOL_SYNC_PEEK_LAST (4)
/* Error codes as returned by the kernel */ /* Error codes as returned by the kernel */
enum btrfs_err_code { enum btrfs_err_code {
BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
@ -1181,6 +1204,8 @@ enum btrfs_err_code {
struct btrfs_ioctl_encoded_io_args) struct btrfs_ioctl_encoded_io_args)
#define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \
struct btrfs_ioctl_encoded_io_args) struct btrfs_ioctl_encoded_io_args)
#define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \
struct btrfs_ioctl_subvol_wait)
#ifdef __cplusplus #ifdef __cplusplus
} }
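For illustration, a hedged userspace sketch of the new ioctl; the helper name and the chosen mode are assumptions, and the fd is assumed to be open on a btrfs mount with uapi headers that already contain this addition.

/* Illustration only: wait until every currently queued deleted subvolume is cleaned. */
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int wait_for_queued_cleanups(int fd)
{
	struct btrfs_ioctl_subvol_wait args = {
		.subvolid = 0,
		.mode = BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED,
	};

	/* Returns 0 on success, -1 with errno set otherwise. */
	return ioctl(fd, BTRFS_IOC_SUBVOL_SYNC_WAIT, &args);
}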


@ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{ {
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
unsigned int flags = IO_URING_F_COMPLETE_DEFER;
if (current->flags & (PF_EXITING | PF_KTHREAD))
flags |= IO_URING_F_TASK_DEAD;
/* task_work executor checks the deferred list completion */ /* task_work executor checks the deferred list completion */
ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); ioucmd->task_work_cb(ioucmd, flags);
} }
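As a consumer-side illustration, a hypothetical ->task_work_cb might react to the new flag as sketched below; the callback name and the -ECANCELED completion are assumptions for the sketch, not behaviour this series prescribes.

/* Illustration only: skip the normal completion path when the issuing task is dying. */
static void demo_cmd_task_work_cb(struct io_uring_cmd *ioucmd, unsigned int flags)
{
	if (flags & IO_URING_F_TASK_DEAD) {
		/* Don't touch the exiting task's context; just finish the command. */
		io_uring_cmd_done(ioucmd, -ECANCELED, 0, flags);
		return;
	}

	/* Normal deferred completion, running in the issuing task's context. */
}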
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,