2022-09-09 21:53:21 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <trace/events/btrfs.h>
|
2022-10-19 14:50:49 +00:00
|
|
|
#include "messages.h"
|
2022-09-09 21:53:21 +00:00
|
|
|
#include "ctree.h"
|
|
|
|
#include "extent-io-tree.h"
|
2022-09-09 21:53:25 +00:00
|
|
|
#include "btrfs_inode.h"
|
2022-09-09 21:53:28 +00:00
|
|
|
#include "misc.h"
|
2022-09-09 21:53:21 +00:00
|
|
|
|
|
|
|
static struct kmem_cache *extent_state_cache;
|
|
|
|
|
2022-09-09 21:53:31 +00:00
|
|
|
static inline bool extent_state_in_tree(const struct extent_state *state)
|
|
|
|
{
|
|
|
|
return !RB_EMPTY_NODE(&state->rb_node);
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:21 +00:00
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
|
|
static LIST_HEAD(states);
|
|
|
|
static DEFINE_SPINLOCK(leak_lock);
|
|
|
|
|
|
|
|
static inline void btrfs_leak_debug_add_state(struct extent_state *state)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&leak_lock, flags);
|
|
|
|
list_add(&state->leak_list, &states);
|
|
|
|
spin_unlock_irqrestore(&leak_lock, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void btrfs_leak_debug_del_state(struct extent_state *state)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&leak_lock, flags);
|
|
|
|
list_del(&state->leak_list);
|
|
|
|
spin_unlock_irqrestore(&leak_lock, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void btrfs_extent_state_leak_debug_check(void)
|
|
|
|
{
|
|
|
|
struct extent_state *state;
|
|
|
|
|
|
|
|
while (!list_empty(&states)) {
|
|
|
|
state = list_entry(states.next, struct extent_state, leak_list);
|
|
|
|
pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
|
|
|
|
state->start, state->end, state->state,
|
|
|
|
extent_state_in_tree(state),
|
|
|
|
refcount_read(&state->refs));
|
|
|
|
list_del(&state->leak_list);
|
|
|
|
kmem_cache_free(extent_state_cache, state);
|
|
|
|
}
|
|
|
|
}
|
2022-09-09 21:53:25 +00:00
|
|
|
|
2022-09-09 21:53:30 +00:00
|
|
|
#define btrfs_debug_check_extent_io_range(tree, start, end) \
|
|
|
|
__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
|
|
|
|
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
|
|
|
|
struct extent_io_tree *tree,
|
|
|
|
u64 start, u64 end)
|
2022-09-09 21:53:25 +00:00
|
|
|
{
|
2022-10-28 00:55:51 +00:00
|
|
|
struct btrfs_inode *inode = tree->inode;
|
2022-09-09 21:53:25 +00:00
|
|
|
u64 isize;
|
|
|
|
|
2022-09-09 21:53:49 +00:00
|
|
|
if (!inode)
|
2022-09-09 21:53:25 +00:00
|
|
|
return;
|
|
|
|
|
2022-10-28 00:55:51 +00:00
|
|
|
isize = i_size_read(&inode->vfs_inode);
|
2022-09-09 21:53:25 +00:00
|
|
|
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
|
2022-10-28 00:55:51 +00:00
|
|
|
btrfs_debug_rl(inode->root->fs_info,
|
2022-09-09 21:53:25 +00:00
|
|
|
"%s: ino %llu isize %llu odd range [%llu,%llu]",
|
2022-10-28 00:55:51 +00:00
|
|
|
caller, btrfs_ino(inode), isize, start, end);
|
2022-09-09 21:53:25 +00:00
|
|
|
}
|
|
|
|
}
|
2022-09-09 21:53:21 +00:00
|
|
|
#else
|
|
|
|
/* Without CONFIG_BTRFS_DEBUG the leak tracking and range check hooks are no-ops. */
#define btrfs_leak_debug_add_state(state)		do {} while (0)
#define btrfs_leak_debug_del_state(state)		do {} while (0)
#define btrfs_extent_state_leak_debug_check()		do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
|
2022-09-09 21:53:21 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For the file_extent_tree, we want to hold the inode lock when we lookup and
|
|
|
|
 * update the disk_i_size, but lockdep will complain because for our io_tree we hold
|
|
|
|
* the tree lock and get the inode lock when setting delalloc. These two things
|
|
|
|
* are unrelated, so make a class for the file_extent_tree so we don't get the
|
|
|
|
* two locking patterns mixed up.
|
|
|
|
*/
|
|
|
|
static struct lock_class_key file_extent_tree_class;
|
|
|
|
|
2022-09-09 21:53:31 +00:00
|
|
|
struct tree_entry {
|
|
|
|
u64 start;
|
|
|
|
u64 end;
|
|
|
|
struct rb_node rb_node;
|
|
|
|
};
|
|
|
|
|
2022-09-09 21:53:21 +00:00
|
|
|
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
|
2022-10-28 00:47:06 +00:00
|
|
|
struct extent_io_tree *tree, unsigned int owner)
|
2022-09-09 21:53:21 +00:00
|
|
|
{
|
|
|
|
tree->fs_info = fs_info;
|
|
|
|
tree->state = RB_ROOT;
|
|
|
|
spin_lock_init(&tree->lock);
|
2022-10-28 00:55:51 +00:00
|
|
|
tree->inode = NULL;
|
2022-09-09 21:53:21 +00:00
|
|
|
tree->owner = owner;
|
|
|
|
if (owner == IO_TREE_INODE_FILE_EXTENT)
|
|
|
|
lockdep_set_class(&tree->lock, &file_extent_tree_class);
|
|
|
|
}
|
|
|
|
|
2023-09-22 10:39:03 +00:00
|
|
|
/*
|
|
|
|
* Empty an io tree, removing and freeing every extent state record from the
|
|
|
|
* tree. This should be called once we are sure no other task can access the
|
|
|
|
* tree anymore, so no tree updates happen after we empty the tree and there
|
|
|
|
* aren't any waiters on any extent state record (EXTENT_LOCKED bit is never
|
|
|
|
* set on any extent state when calling this function).
|
|
|
|
*/
|
2022-09-09 21:53:21 +00:00
|
|
|
void extent_io_tree_release(struct extent_io_tree *tree)
|
|
|
|
{
|
2023-09-22 10:39:07 +00:00
|
|
|
struct rb_root root;
|
|
|
|
struct extent_state *state;
|
|
|
|
struct extent_state *tmp;
|
2022-09-09 21:53:21 +00:00
|
|
|
|
2023-09-22 10:39:07 +00:00
|
|
|
spin_lock(&tree->lock);
|
|
|
|
root = tree->state;
|
|
|
|
tree->state = RB_ROOT;
|
|
|
|
rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) {
|
|
|
|
/* Clear node to keep free_extent_state() happy. */
|
2022-09-09 21:53:21 +00:00
|
|
|
RB_CLEAR_NODE(&state->rb_node);
|
2023-09-22 10:39:03 +00:00
|
|
|
ASSERT(!(state->state & EXTENT_LOCKED));
|
2023-09-22 10:39:05 +00:00
|
|
|
/*
|
|
|
|
* No need for a memory barrier here, as we are holding the tree
|
|
|
|
* lock and we only change the waitqueue while holding that lock
|
2023-09-22 10:39:06 +00:00
|
|
|
* (see wait_extent_bit()).
|
2023-09-22 10:39:05 +00:00
|
|
|
*/
|
2022-09-09 21:53:21 +00:00
|
|
|
ASSERT(!waitqueue_active(&state->wq));
|
|
|
|
free_extent_state(state);
|
|
|
|
cond_resched_lock(&tree->lock);
|
|
|
|
}
|
2023-09-22 10:39:07 +00:00
|
|
|
/*
|
|
|
|
* Should still be empty even after a reschedule, no other task should
|
|
|
|
* be accessing the tree anymore.
|
|
|
|
*/
|
|
|
|
ASSERT(RB_EMPTY_ROOT(&tree->state));
|
2022-09-09 21:53:21 +00:00
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:31 +00:00
|
|
|
/*
 * Allocate an extent state record from the slab cache.
 *
 * @mask: allocation flags; __GFP_DMA32 and __GFP_HIGHMEM are stripped first
 *
 * Returns NULL if the allocation fails. Otherwise the record has no state
 * bits set, is not linked into any tree and holds a single reference.
 */
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *new_state;

	/* The slab allocator may not accept these bits, so drop them. */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);

	new_state = kmem_cache_alloc(extent_state_cache, mask);
	if (new_state == NULL)
		return NULL;

	new_state->state = 0;
	RB_CLEAR_NODE(&new_state->rb_node);
	btrfs_leak_debug_add_state(new_state);
	refcount_set(&new_state->refs, 1);
	init_waitqueue_head(&new_state->wq);
	trace_alloc_extent_state(new_state, mask, _RET_IP_);

	return new_state;
}
|
|
|
|
|
2022-09-09 21:53:31 +00:00
|
|
|
static struct extent_state *alloc_extent_state_atomic(struct extent_state *prealloc)
|
2022-09-09 21:53:21 +00:00
|
|
|
{
|
|
|
|
if (!prealloc)
|
|
|
|
prealloc = alloc_extent_state(GFP_ATOMIC);
|
|
|
|
|
|
|
|
return prealloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
void free_extent_state(struct extent_state *state)
|
|
|
|
{
|
|
|
|
if (!state)
|
|
|
|
return;
|
|
|
|
if (refcount_dec_and_test(&state->refs)) {
|
|
|
|
WARN_ON(extent_state_in_tree(state));
|
|
|
|
btrfs_leak_debug_del_state(state);
|
|
|
|
trace_free_extent_state(state, _RET_IP_);
|
|
|
|
kmem_cache_free(extent_state_cache, state);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:27 +00:00
|
|
|
/*
 * Account the range of @state in @changeset when setting or clearing @bits
 * on it actually changes something.
 *
 * @state:     extent state about to be modified
 * @bits:      bits about to be set or cleared
 * @changeset: where to record the change, may be NULL (nothing is recorded)
 * @set:       non-zero when setting @bits, zero when clearing them
 *
 * Returns 0 when there is nothing to record, otherwise the return value of
 * ulist_add().
 */
static int add_extent_changeset(struct extent_state *state, u32 bits,
				 struct extent_changeset *changeset,
				 int set)
{
	u32 present;

	if (!changeset)
		return 0;

	present = state->state & bits;
	/*
	 * No change if all of @bits are already set when setting, or none of
	 * them is set when clearing.
	 */
	if (set ? present == bits : present == 0)
		return 0;

	changeset->bytes_changed += state->end - state->start + 1;

	return ulist_add(&changeset->range_changed, state->start, state->end,
			 GFP_ATOMIC);
}
|
|
|
|
|
2022-09-09 21:53:31 +00:00
|
|
|
static inline struct extent_state *next_state(struct extent_state *state)
|
|
|
|
{
|
|
|
|
struct rb_node *next = rb_next(&state->rb_node);
|
|
|
|
|
|
|
|
if (next)
|
|
|
|
return rb_entry(next, struct extent_state, rb_node);
|
|
|
|
else
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:36 +00:00
|
|
|
static inline struct extent_state *prev_state(struct extent_state *state)
|
|
|
|
{
|
|
|
|
struct rb_node *next = rb_prev(&state->rb_node);
|
|
|
|
|
|
|
|
if (next)
|
|
|
|
return rb_entry(next, struct extent_state, rb_node);
|
|
|
|
else
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:26 +00:00
|
|
|
/*
|
|
|
|
* Search @tree for an entry that contains @offset. Such entry would have
|
|
|
|
* entry->start <= offset && entry->end >= offset.
|
|
|
|
*
|
|
|
|
* @tree: the tree to search
|
|
|
|
* @offset: offset that should fall within an entry in @tree
|
|
|
|
* @node_ret: pointer where new node should be anchored (used when inserting an
|
|
|
|
* entry in the tree)
|
|
|
|
* @parent_ret: points to entry which would have been the parent of the entry,
|
|
|
|
* containing @offset
|
|
|
|
*
|
|
|
|
* Return a pointer to the entry that contains @offset byte address and don't change
|
|
|
|
* @node_ret and @parent_ret.
|
|
|
|
*
|
|
|
|
* If no such entry exists, return pointer to entry that ends before @offset
|
|
|
|
* and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
|
|
|
|
*/
|
2022-09-09 21:53:35 +00:00
|
|
|
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
|
|
|
|
u64 offset,
|
|
|
|
struct rb_node ***node_ret,
|
|
|
|
struct rb_node **parent_ret)
|
2022-09-09 21:53:26 +00:00
|
|
|
{
|
|
|
|
struct rb_root *root = &tree->state;
|
|
|
|
struct rb_node **node = &root->rb_node;
|
|
|
|
struct rb_node *prev = NULL;
|
2022-09-09 21:53:35 +00:00
|
|
|
struct extent_state *entry = NULL;
|
2022-09-09 21:53:26 +00:00
|
|
|
|
|
|
|
while (*node) {
|
|
|
|
prev = *node;
|
2022-09-09 21:53:32 +00:00
|
|
|
entry = rb_entry(prev, struct extent_state, rb_node);
|
2022-09-09 21:53:26 +00:00
|
|
|
|
|
|
|
if (offset < entry->start)
|
|
|
|
node = &(*node)->rb_left;
|
|
|
|
else if (offset > entry->end)
|
|
|
|
node = &(*node)->rb_right;
|
|
|
|
else
|
2022-09-09 21:53:35 +00:00
|
|
|
return entry;
|
2022-09-09 21:53:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (node_ret)
|
|
|
|
*node_ret = node;
|
|
|
|
if (parent_ret)
|
|
|
|
*parent_ret = prev;
|
|
|
|
|
|
|
|
/* Search neighbors until we find the first one past the end */
|
2022-09-09 21:53:35 +00:00
|
|
|
while (entry && offset > entry->end)
|
|
|
|
entry = next_state(entry);
|
2022-09-09 21:53:26 +00:00
|
|
|
|
2022-09-09 21:53:35 +00:00
|
|
|
return entry;
|
2022-09-09 21:53:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Search offset in the tree or fill neighbor rbtree node pointers.
|
|
|
|
*
|
|
|
|
* @tree: the tree to search
|
|
|
|
* @offset: offset that should fall within an entry in @tree
|
|
|
|
* @next_ret: pointer to the first entry whose range ends after @offset
|
|
|
|
* @prev_ret: pointer to the first entry whose range begins before @offset
|
|
|
|
*
|
|
|
|
* Return a pointer to the entry that contains @offset byte address. If no
|
|
|
|
* such entry exists, then return NULL and fill @prev_ret and @next_ret.
|
|
|
|
* Otherwise return the found entry and other pointers are left untouched.
|
|
|
|
*/
|
2022-09-09 21:53:36 +00:00
|
|
|
static struct extent_state *tree_search_prev_next(struct extent_io_tree *tree,
|
|
|
|
u64 offset,
|
|
|
|
struct extent_state **prev_ret,
|
|
|
|
struct extent_state **next_ret)
|
2022-09-09 21:53:26 +00:00
|
|
|
{
|
|
|
|
struct rb_root *root = &tree->state;
|
|
|
|
struct rb_node **node = &root->rb_node;
|
2022-09-09 21:53:36 +00:00
|
|
|
struct extent_state *orig_prev;
|
|
|
|
struct extent_state *entry = NULL;
|
2022-09-09 21:53:26 +00:00
|
|
|
|
|
|
|
ASSERT(prev_ret);
|
|
|
|
ASSERT(next_ret);
|
|
|
|
|
|
|
|
while (*node) {
|
2022-09-09 21:53:36 +00:00
|
|
|
entry = rb_entry(*node, struct extent_state, rb_node);
|
2022-09-09 21:53:26 +00:00
|
|
|
|
|
|
|
if (offset < entry->start)
|
|
|
|
node = &(*node)->rb_left;
|
|
|
|
else if (offset > entry->end)
|
|
|
|
node = &(*node)->rb_right;
|
|
|
|
else
|
2022-09-09 21:53:36 +00:00
|
|
|
return entry;
|
2022-09-09 21:53:26 +00:00
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:36 +00:00
|
|
|
orig_prev = entry;
|
|
|
|
while (entry && offset > entry->end)
|
|
|
|
entry = next_state(entry);
|
|
|
|
*next_ret = entry;
|
|
|
|
entry = orig_prev;
|
2022-09-09 21:53:26 +00:00
|
|
|
|
2022-09-09 21:53:36 +00:00
|
|
|
while (entry && offset < entry->start)
|
|
|
|
entry = prev_state(entry);
|
|
|
|
*prev_ret = entry;
|
2022-09-09 21:53:26 +00:00
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
/*
|
|
|
|
* Inexact rb-tree search, return the next entry if @offset is not found
|
|
|
|
*/
|
2022-09-09 21:53:34 +00:00
|
|
|
static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 offset)
|
2022-09-09 21:53:29 +00:00
|
|
|
{
|
2022-09-09 21:53:35 +00:00
|
|
|
return tree_search_for_insert(tree, offset, NULL, NULL);
|
2022-09-09 21:53:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
|
|
|
|
{
|
|
|
|
btrfs_panic(tree->fs_info, err,
|
|
|
|
"locking error: extent tree was modified by another thread while locked");
|
|
|
|
}
|
|
|
|
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
|
|
|
|
{
|
|
|
|
struct extent_state *prev;
|
|
|
|
|
|
|
|
prev = prev_state(state);
|
|
|
|
if (prev && prev->end == state->start - 1 && prev->state == state->state) {
|
|
|
|
if (tree->inode)
|
|
|
|
btrfs_merge_delalloc_extent(tree->inode, state, prev);
|
|
|
|
state->start = prev->start;
|
|
|
|
rb_erase(&prev->rb_node, &tree->state);
|
|
|
|
RB_CLEAR_NODE(&prev->rb_node);
|
|
|
|
free_extent_state(prev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state)
|
|
|
|
{
|
|
|
|
struct extent_state *next;
|
|
|
|
|
|
|
|
next = next_state(state);
|
|
|
|
if (next && next->start == state->end + 1 && next->state == state->state) {
|
|
|
|
if (tree->inode)
|
|
|
|
btrfs_merge_delalloc_extent(tree->inode, state, next);
|
|
|
|
state->end = next->end;
|
|
|
|
rb_erase(&next->rb_node, &tree->state);
|
|
|
|
RB_CLEAR_NODE(&next->rb_node);
|
|
|
|
free_extent_state(next);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:27 +00:00
|
|
|
/*
|
|
|
|
* Utility function to look for merge candidates inside a given range. Any
|
|
|
|
* extents with matching state are merged together into a single extent in the
|
|
|
|
* tree. Extents with EXTENT_IO in their state field are not merged because
|
|
|
|
* the end_io handlers need to be able to do operations on them without
|
|
|
|
* sleeping (or doing allocations/splits).
|
|
|
|
*
|
|
|
|
* This should be called with the tree lock held.
|
|
|
|
*/
|
2022-09-09 21:53:31 +00:00
|
|
|
static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
|
2022-09-09 21:53:27 +00:00
|
|
|
{
|
|
|
|
if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
|
|
|
|
return;
|
|
|
|
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite of lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on other we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
merge_prev_state(tree, state);
|
|
|
|
merge_next_state(tree, state);
|
2022-09-09 21:53:27 +00:00
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:31 +00:00
|
|
|
static void set_state_bits(struct extent_io_tree *tree,
|
|
|
|
struct extent_state *state,
|
|
|
|
u32 bits, struct extent_changeset *changeset)
|
2022-09-09 21:53:27 +00:00
|
|
|
{
|
|
|
|
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
|
|
|
|
int ret;
|
|
|
|
|
2022-10-28 00:55:51 +00:00
|
|
|
if (tree->inode)
|
2022-10-27 00:41:32 +00:00
|
|
|
btrfs_set_delalloc_extent(tree->inode, state, bits);
|
2022-09-09 21:53:27 +00:00
|
|
|
|
|
|
|
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
|
|
|
|
BUG_ON(ret < 0);
|
|
|
|
state->state |= bits_to_set;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Insert an extent_state struct into the tree. 'bits' are set on the
|
|
|
|
* struct before it is inserted.
|
|
|
|
*
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
* Returns a pointer to the struct extent_state record containing the range
|
|
|
|
* requested for insertion, which may be the same as the given struct or it
|
|
|
|
* may be an existing record in the tree that was expanded to accommodate the
|
|
|
|
* requested range. In case of an extent_state different from the one that was
|
|
|
|
 * given, the latter can be freed or reused by the caller.
|
|
|
|
*
|
|
|
|
* On error it returns an error pointer.
|
2022-09-09 21:53:27 +00:00
|
|
|
*
|
|
|
|
* The tree lock is not taken internally. This is a utility function and
|
|
|
|
* probably isn't what you want to call (see set/clear_extent_bit).
|
|
|
|
*/
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
static struct extent_state *insert_state(struct extent_io_tree *tree,
|
|
|
|
struct extent_state *state,
|
|
|
|
u32 bits,
|
|
|
|
struct extent_changeset *changeset)
|
2022-09-09 21:53:27 +00:00
|
|
|
{
|
|
|
|
struct rb_node **node;
|
2022-11-18 20:06:09 +00:00
|
|
|
struct rb_node *parent = NULL;
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite of lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on other we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
const u64 start = state->start - 1;
|
|
|
|
const u64 end = state->end + 1;
|
|
|
|
const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY));
|
2022-09-09 21:53:27 +00:00
|
|
|
|
|
|
|
set_state_bits(tree, state, bits, changeset);
|
|
|
|
|
|
|
|
node = &tree->state.rb_node;
|
|
|
|
while (*node) {
|
2022-09-09 21:53:32 +00:00
|
|
|
struct extent_state *entry;
|
2022-09-09 21:53:27 +00:00
|
|
|
|
|
|
|
parent = *node;
|
2022-09-09 21:53:32 +00:00
|
|
|
entry = rb_entry(parent, struct extent_state, rb_node);
|
2022-09-09 21:53:27 +00:00
|
|
|
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite of lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on other we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
if (state->end < entry->start) {
|
|
|
|
if (try_merge && end == entry->start &&
|
|
|
|
state->state == entry->state) {
|
|
|
|
if (tree->inode)
|
|
|
|
btrfs_merge_delalloc_extent(tree->inode,
|
|
|
|
state, entry);
|
|
|
|
entry->start = state->start;
|
|
|
|
merge_prev_state(tree, entry);
|
|
|
|
state->state = 0;
|
|
|
|
return entry;
|
|
|
|
}
|
2022-09-09 21:53:27 +00:00
|
|
|
node = &(*node)->rb_left;
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite of lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on other we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
} else if (state->end > entry->end) {
|
|
|
|
if (try_merge && entry->end == start &&
|
|
|
|
state->state == entry->state) {
|
|
|
|
if (tree->inode)
|
|
|
|
btrfs_merge_delalloc_extent(tree->inode,
|
|
|
|
state, entry);
|
|
|
|
entry->end = state->end;
|
|
|
|
merge_next_state(tree, entry);
|
|
|
|
state->state = 0;
|
|
|
|
return entry;
|
|
|
|
}
|
2022-09-09 21:53:27 +00:00
|
|
|
node = &(*node)->rb_right;
|
|
|
|
} else {
|
|
|
|
btrfs_err(tree->fs_info,
|
|
|
|
"found node %llu %llu on insert of %llu %llu",
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite of lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on other we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
entry->start, entry->end, state->start, state->end);
|
|
|
|
return ERR_PTR(-EEXIST);
|
2022-09-09 21:53:27 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&state->rb_node, parent, node);
|
|
|
|
rb_insert_color(&state->rb_node, &tree->state);
|
|
|
|
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite of lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on other we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
return state;
|
2022-09-09 21:53:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Insert state to @tree to the location given by @node and @parent.
|
|
|
|
*/
|
2022-09-09 21:53:31 +00:00
|
|
|
static void insert_state_fast(struct extent_io_tree *tree,
|
|
|
|
struct extent_state *state, struct rb_node **node,
|
|
|
|
struct rb_node *parent, unsigned bits,
|
|
|
|
struct extent_changeset *changeset)
|
2022-09-09 21:53:27 +00:00
|
|
|
{
|
|
|
|
set_state_bits(tree, state, bits, changeset);
|
|
|
|
rb_link_node(&state->rb_node, parent, node);
|
|
|
|
rb_insert_color(&state->rb_node, &tree->state);
|
|
|
|
merge_state(tree, state);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Split a given extent state struct in two, inserting the preallocated
|
|
|
|
* struct 'prealloc' as the newly created second half. 'split' indicates an
|
|
|
|
* offset inside 'orig' where it should be split.
|
|
|
|
*
|
|
|
|
* Before calling,
|
|
|
|
* the tree has 'orig' at [orig->start, orig->end]. After calling, there
|
|
|
|
* are two extent state structs in the tree:
|
|
|
|
* prealloc: [orig->start, split - 1]
|
|
|
|
* orig: [ split, orig->end ]
|
|
|
|
*
|
|
|
|
* The tree locks are not taken by this function. They need to be held
|
|
|
|
* by the caller.
|
|
|
|
*/
|
2022-09-09 21:53:31 +00:00
|
|
|
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
|
|
|
|
struct extent_state *prealloc, u64 split)
|
2022-09-09 21:53:27 +00:00
|
|
|
{
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct rb_node **node;
|
|
|
|
|
2022-10-28 00:55:51 +00:00
|
|
|
if (tree->inode)
|
2022-10-27 00:41:32 +00:00
|
|
|
btrfs_split_delalloc_extent(tree->inode, orig, split);
|
2022-09-09 21:53:27 +00:00
|
|
|
|
|
|
|
prealloc->start = orig->start;
|
|
|
|
prealloc->end = split - 1;
|
|
|
|
prealloc->state = orig->state;
|
|
|
|
orig->start = split;
|
|
|
|
|
|
|
|
parent = &orig->rb_node;
|
|
|
|
node = &parent;
|
|
|
|
while (*node) {
|
2022-09-09 21:53:32 +00:00
|
|
|
struct extent_state *entry;
|
2022-09-09 21:53:27 +00:00
|
|
|
|
|
|
|
parent = *node;
|
2022-09-09 21:53:32 +00:00
|
|
|
entry = rb_entry(parent, struct extent_state, rb_node);
|
2022-09-09 21:53:27 +00:00
|
|
|
|
|
|
|
if (prealloc->end < entry->start) {
|
|
|
|
node = &(*node)->rb_left;
|
|
|
|
} else if (prealloc->end > entry->end) {
|
|
|
|
node = &(*node)->rb_right;
|
|
|
|
} else {
|
|
|
|
free_extent_state(prealloc);
|
|
|
|
return -EEXIST;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&prealloc->rb_node, parent, node);
|
|
|
|
rb_insert_color(&prealloc->rb_node, &tree->state);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Utility function to clear some bits in an extent state struct. It will
|
|
|
|
* optionally wake up anyone waiting on this state (wake == 1).
|
|
|
|
*
|
|
|
|
* If no bits are set on the state struct after clearing things, the
|
|
|
|
* struct is freed and removed from the tree
|
|
|
|
*/
|
2022-09-09 21:53:31 +00:00
|
|
|
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
|
|
|
|
struct extent_state *state,
|
|
|
|
u32 bits, int wake,
|
|
|
|
struct extent_changeset *changeset)
|
2022-09-09 21:53:27 +00:00
|
|
|
{
|
|
|
|
struct extent_state *next;
|
|
|
|
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
|
|
|
|
int ret;
|
|
|
|
|
2022-10-28 00:55:51 +00:00
|
|
|
if (tree->inode)
|
2022-10-27 00:41:32 +00:00
|
|
|
btrfs_clear_delalloc_extent(tree->inode, state, bits);
|
2022-09-09 21:53:27 +00:00
|
|
|
|
|
|
|
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
|
|
|
|
BUG_ON(ret < 0);
|
|
|
|
state->state &= ~bits_to_clear;
|
|
|
|
if (wake)
|
|
|
|
wake_up(&state->wq);
|
|
|
|
if (state->state == 0) {
|
|
|
|
next = next_state(state);
|
|
|
|
if (extent_state_in_tree(state)) {
|
|
|
|
rb_erase(&state->rb_node, &tree->state);
|
|
|
|
RB_CLEAR_NODE(&state->rb_node);
|
|
|
|
free_extent_state(state);
|
|
|
|
} else {
|
|
|
|
WARN_ON(1);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
merge_state(tree, state);
|
|
|
|
next = next_state(state);
|
|
|
|
}
|
|
|
|
return next;
|
|
|
|
}
|
|
|
|
|
2023-05-24 23:04:37 +00:00
|
|
|
/*
|
|
|
|
* Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly,
|
|
|
|
* unset the EXTENT_NOWAIT bit.
|
|
|
|
*/
|
|
|
|
static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
|
|
|
|
{
|
|
|
|
*mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS);
|
|
|
|
*bits &= EXTENT_NOWAIT - 1;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
/*
|
|
|
|
* Clear some bits on a range in the tree. This may require splitting or
|
|
|
|
* inserting elements in the tree, so the gfp mask is used to indicate which
|
|
|
|
* allocations or sleeping are allowed.
|
|
|
|
*
|
|
|
|
* Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the given
|
|
|
|
* range from the tree regardless of state (ie for truncate).
|
|
|
|
*
|
|
|
|
* The range [start, end] is inclusive.
|
|
|
|
*
|
|
|
|
* This takes the tree lock, and returns 0 on success and < 0 on error.
|
|
|
|
*/
|
|
|
|
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
2022-09-09 21:53:47 +00:00
|
|
|
u32 bits, struct extent_state **cached_state,
|
2023-05-24 23:04:39 +00:00
|
|
|
struct extent_changeset *changeset)
|
2022-09-09 21:53:29 +00:00
|
|
|
{
|
|
|
|
struct extent_state *state;
|
|
|
|
struct extent_state *cached;
|
|
|
|
struct extent_state *prealloc = NULL;
|
|
|
|
u64 last_end;
|
|
|
|
int err;
|
|
|
|
int clear = 0;
|
2022-09-09 21:53:47 +00:00
|
|
|
int wake;
|
|
|
|
int delete = (bits & EXTENT_CLEAR_ALL_BITS);
|
2023-05-24 23:04:39 +00:00
|
|
|
gfp_t mask;
|
2022-09-09 21:53:29 +00:00
|
|
|
|
2023-05-24 23:04:37 +00:00
|
|
|
set_gfp_mask_from_bits(&bits, &mask);
|
2022-09-09 21:53:29 +00:00
|
|
|
btrfs_debug_check_extent_io_range(tree, start, end);
|
|
|
|
trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
|
|
|
|
|
|
|
|
if (delete)
|
|
|
|
bits |= ~EXTENT_CTLBITS;
|
|
|
|
|
2022-09-09 21:53:47 +00:00
|
|
|
if (bits & EXTENT_DELALLOC)
|
|
|
|
bits |= EXTENT_NORESERVE;
|
|
|
|
|
|
|
|
wake = (bits & EXTENT_LOCKED) ? 1 : 0;
|
2022-09-09 21:53:29 +00:00
|
|
|
if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
|
|
|
|
clear = 1;
|
|
|
|
again:
|
2022-10-14 14:00:41 +00:00
|
|
|
if (!prealloc) {
|
2022-09-09 21:53:29 +00:00
|
|
|
/*
|
|
|
|
* Don't care for allocation failure here because we might end
|
|
|
|
* up not needing the pre-allocated extent state at all, which
|
|
|
|
* is the case if we only have in the tree extent states that
|
|
|
|
* cover our input range and don't cover too any other range.
|
|
|
|
* If we end up needing a new extent state we allocate it later.
|
|
|
|
*/
|
|
|
|
prealloc = alloc_extent_state(mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
if (cached_state) {
|
|
|
|
cached = *cached_state;
|
|
|
|
|
|
|
|
if (clear) {
|
|
|
|
*cached_state = NULL;
|
|
|
|
cached_state = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cached && extent_state_in_tree(cached) &&
|
|
|
|
cached->start <= start && cached->end > start) {
|
|
|
|
if (clear)
|
|
|
|
refcount_dec(&cached->refs);
|
|
|
|
state = cached;
|
|
|
|
goto hit_next;
|
|
|
|
}
|
|
|
|
if (clear)
|
|
|
|
free_extent_state(cached);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This search will find the extents that end after our range starts. */
|
2022-09-09 21:53:34 +00:00
|
|
|
state = tree_search(tree, start);
|
|
|
|
if (!state)
|
2022-09-09 21:53:29 +00:00
|
|
|
goto out;
|
|
|
|
hit_next:
|
|
|
|
if (state->start > end)
|
|
|
|
goto out;
|
|
|
|
WARN_ON(state->end < start);
|
|
|
|
last_end = state->end;
|
|
|
|
|
|
|
|
/* The state doesn't have the wanted bits, go ahead. */
|
|
|
|
if (!(state->state & bits)) {
|
|
|
|
state = next_state(state);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state | or
|
|
|
|
* | ------------- state -------------- |
|
|
|
|
*
|
|
|
|
* We need to split the extent we found, and may flip bits on second
|
|
|
|
* half.
|
|
|
|
*
|
|
|
|
* If the extent we found extends past our range, we just split and
|
|
|
|
* search again. It'll get split again the next time though.
|
|
|
|
*
|
|
|
|
* If the extent we found is inside our range, we clear the desired bit
|
|
|
|
* on it.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (state->start < start) {
|
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
2022-10-14 14:00:41 +00:00
|
|
|
if (!prealloc)
|
|
|
|
goto search_again;
|
2022-09-09 21:53:29 +00:00
|
|
|
err = split_state(tree, state, prealloc, start);
|
|
|
|
if (err)
|
|
|
|
extent_io_tree_panic(tree, err);
|
|
|
|
|
|
|
|
prealloc = NULL;
|
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
if (state->end <= end) {
|
|
|
|
state = clear_state_bit(tree, state, bits, wake, changeset);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
goto search_again;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state |
|
|
|
|
* We need to split the extent, and clear the bit on the first half.
|
|
|
|
*/
|
|
|
|
if (state->start <= end && state->end > end) {
|
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
2022-10-14 14:00:41 +00:00
|
|
|
if (!prealloc)
|
|
|
|
goto search_again;
|
2022-09-09 21:53:29 +00:00
|
|
|
err = split_state(tree, state, prealloc, end + 1);
|
|
|
|
if (err)
|
|
|
|
extent_io_tree_panic(tree, err);
|
|
|
|
|
|
|
|
if (wake)
|
|
|
|
wake_up(&state->wq);
|
|
|
|
|
|
|
|
clear_state_bit(tree, prealloc, bits, wake, changeset);
|
|
|
|
|
|
|
|
prealloc = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
state = clear_state_bit(tree, state, bits, wake, changeset);
|
|
|
|
next:
|
|
|
|
if (last_end == (u64)-1)
|
|
|
|
goto out;
|
|
|
|
start = last_end + 1;
|
|
|
|
if (start <= end && state && !need_resched())
|
|
|
|
goto hit_next;
|
|
|
|
|
|
|
|
search_again:
|
|
|
|
if (start > end)
|
|
|
|
goto out;
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
if (gfpflags_allow_blocking(mask))
|
|
|
|
cond_resched();
|
|
|
|
goto again;
|
|
|
|
|
|
|
|
out:
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
if (prealloc)
|
|
|
|
free_extent_state(prealloc);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait for one or more bits to clear on a range in the state tree.
|
|
|
|
* The range [start, end] is inclusive.
|
|
|
|
* The tree lock is taken by this function
|
|
|
|
*/
|
2023-09-22 10:39:04 +00:00
|
|
|
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
u32 bits, struct extent_state **cached_state)
|
2022-09-09 21:53:29 +00:00
|
|
|
{
|
|
|
|
struct extent_state *state;
|
|
|
|
|
|
|
|
btrfs_debug_check_extent_io_range(tree, start, end);
|
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
again:
|
2022-09-30 20:45:12 +00:00
|
|
|
/*
|
|
|
|
* Maintain cached_state, as we may not remove it from the tree if there
|
|
|
|
* are more bits than the bits we're waiting on set on this state.
|
|
|
|
*/
|
|
|
|
if (cached_state && *cached_state) {
|
|
|
|
state = *cached_state;
|
|
|
|
if (extent_state_in_tree(state) &&
|
|
|
|
state->start <= start && start < state->end)
|
|
|
|
goto process_node;
|
|
|
|
}
|
2022-09-09 21:53:29 +00:00
|
|
|
while (1) {
|
|
|
|
/*
|
|
|
|
* This search will find all the extents that end after our
|
|
|
|
* range starts.
|
|
|
|
*/
|
2022-09-09 21:53:34 +00:00
|
|
|
state = tree_search(tree, start);
|
2022-09-09 21:53:33 +00:00
|
|
|
process_node:
|
2022-09-09 21:53:34 +00:00
|
|
|
if (!state)
|
|
|
|
break;
|
2022-09-09 21:53:29 +00:00
|
|
|
if (state->start > end)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (state->state & bits) {
|
2023-09-22 10:39:06 +00:00
|
|
|
DEFINE_WAIT(wait);
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
start = state->start;
|
|
|
|
refcount_inc(&state->refs);
|
2023-09-22 10:39:06 +00:00
|
|
|
prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
schedule();
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
finish_wait(&state->wq, &wait);
|
2022-09-09 21:53:29 +00:00
|
|
|
free_extent_state(state);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
start = state->end + 1;
|
|
|
|
|
|
|
|
if (start > end)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (!cond_resched_lock(&tree->lock)) {
|
2022-09-09 21:53:33 +00:00
|
|
|
state = next_state(state);
|
2022-09-09 21:53:29 +00:00
|
|
|
goto process_node;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
2022-09-30 20:45:12 +00:00
|
|
|
/* This state is no longer useful, clear it and free it up. */
|
|
|
|
if (cached_state && *cached_state) {
|
|
|
|
state = *cached_state;
|
|
|
|
*cached_state = NULL;
|
|
|
|
free_extent_state(state);
|
|
|
|
}
|
2022-09-09 21:53:29 +00:00
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cache_state_if_flags(struct extent_state *state,
|
|
|
|
struct extent_state **cached_ptr,
|
|
|
|
unsigned flags)
|
|
|
|
{
|
|
|
|
if (cached_ptr && !(*cached_ptr)) {
|
|
|
|
if (!flags || (state->state & flags)) {
|
|
|
|
*cached_ptr = state;
|
|
|
|
refcount_inc(&state->refs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cache_state(struct extent_state *state,
|
|
|
|
struct extent_state **cached_ptr)
|
|
|
|
{
|
|
|
|
return cache_state_if_flags(state, cached_ptr,
|
|
|
|
EXTENT_LOCKED | EXTENT_BOUNDARY);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the first state struct with 'bits' set after 'start', and return it.
|
|
|
|
* tree->lock must be held. NULL will returned if nothing was found after
|
|
|
|
* 'start'.
|
|
|
|
*/
|
|
|
|
static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
|
|
|
|
u64 start, u32 bits)
|
|
|
|
{
|
|
|
|
struct extent_state *state;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This search will find all the extents that end after our range
|
|
|
|
* starts.
|
|
|
|
*/
|
2022-09-09 21:53:34 +00:00
|
|
|
state = tree_search(tree, start);
|
2022-09-09 21:53:33 +00:00
|
|
|
while (state) {
|
2022-09-09 21:53:29 +00:00
|
|
|
if (state->end >= start && (state->state & bits))
|
|
|
|
return state;
|
2022-09-09 21:53:33 +00:00
|
|
|
state = next_state(state);
|
2022-09-09 21:53:29 +00:00
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the first offset in the io tree with one or more @bits set.
|
|
|
|
*
|
|
|
|
* Note: If there are multiple bits set in @bits, any of them will match.
|
|
|
|
*
|
2023-06-30 15:03:49 +00:00
|
|
|
* Return true if we find something, and update @start_ret and @end_ret.
|
|
|
|
* Return false if we found nothing.
|
2022-09-09 21:53:29 +00:00
|
|
|
*/
|
2023-06-30 15:03:49 +00:00
|
|
|
bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 *start_ret, u64 *end_ret, u32 bits,
|
|
|
|
struct extent_state **cached_state)
|
2022-09-09 21:53:29 +00:00
|
|
|
{
|
|
|
|
struct extent_state *state;
|
2023-06-30 15:03:49 +00:00
|
|
|
bool ret = false;
|
2022-09-09 21:53:29 +00:00
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
if (cached_state && *cached_state) {
|
|
|
|
state = *cached_state;
|
|
|
|
if (state->end == start - 1 && extent_state_in_tree(state)) {
|
|
|
|
while ((state = next_state(state)) != NULL) {
|
|
|
|
if (state->state & bits)
|
btrfs: make sure we cache next state in find_first_extent_bit()
Currently, at find_first_extent_bit(), when we are given a cached extent
state that happens to have its end offset match the desired range start,
we find the next extent state using that cached state, with next_state()
calls, and then return it.
We then try to cache that next state by calling cache_state_if_flags(),
but that will not cache the state because we haven't reset *cached_state
to NULL, so we end up with the cached_state unchanged, and if the caller
is iterating over extent states in the io tree, its next call to
find_first_extent_bit() will not use the current cached state as its end
offset does not match the minimum start range offset, therefore the cached
state is reset and we have to search the rbtree to find the next suitable
extent state record.
So fix this by resetting the cached state to NULL (and dropping our ref
on it) when we have a suitable cached state and we found a next state by
using next_state() starting from the cached state. This makes use cases
of calling find_first_extent_bit() to go over all ranges in the io tree
to do a single rbtree full search, only on the first call, and the next
calls will just do next_state() (rb_next() wrapper) calls, which is more
efficient.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:09 +00:00
|
|
|
break;
|
2022-09-09 21:53:29 +00:00
|
|
|
}
|
btrfs: make sure we cache next state in find_first_extent_bit()
Currently, at find_first_extent_bit(), when we are given a cached extent
state that happens to have its end offset match the desired range start,
we find the next extent state using that cached state, with next_state()
calls, and then return it.
We then try to cache that next state by calling cache_state_if_flags(),
but that will not cache the state because we haven't reset *cached_state
to NULL, so we end up with the cached_state unchanged, and if the caller
is iterating over extent states in the io tree, its next call to
find_first_extent_bit() will not use the current cached state as its end
offset does not match the minimum start range offset, therefore the cached
state is reset and we have to search the rbtree to find the next suitable
extent state record.
So fix this by resetting the cached state to NULL (and dropping our ref
on it) when we have a suitable cached state and we found a next state by
using next_state() starting from the cached state. This makes use cases
of calling find_first_extent_bit() to go over all ranges in the io tree
to do a single rbtree full search, only on the first call, and the next
calls will just do next_state() (rb_next() wrapper) calls, which is more
efficient.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:09 +00:00
|
|
|
/*
|
|
|
|
* If we found the next extent state, clear cached_state
|
|
|
|
* so that we can cache the next extent state below and
|
|
|
|
* avoid future calls going over the same extent state
|
|
|
|
* again. If we haven't found any, clear as well since
|
|
|
|
* it's now useless.
|
|
|
|
*/
|
2022-09-09 21:53:29 +00:00
|
|
|
free_extent_state(*cached_state);
|
|
|
|
*cached_state = NULL;
|
btrfs: make sure we cache next state in find_first_extent_bit()
Currently, at find_first_extent_bit(), when we are given a cached extent
state that happens to have its end offset match the desired range start,
we find the next extent state using that cached state, with next_state()
calls, and then return it.
We then try to cache that next state by calling cache_state_if_flags(),
but that will not cache the state because we haven't reset *cached_state
to NULL, so we end up with the cached_state unchanged, and if the caller
is iterating over extent states in the io tree, its next call to
find_first_extent_bit() will not use the current cached state as its end
offset does not match the minimum start range offset, therefore the cached
state is reset and we have to search the rbtree to find the next suitable
extent state record.
So fix this by resetting the cached state to NULL (and dropping our ref
on it) when we have a suitable cached state and we found a next state by
using next_state() starting from the cached state. This makes use cases
of calling find_first_extent_bit() to go over all ranges in the io tree
to do a single rbtree full search, only on the first call, and the next
calls will just do next_state() (rb_next() wrapper) calls, which is more
efficient.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:09 +00:00
|
|
|
if (state)
|
|
|
|
goto got_it;
|
2022-09-09 21:53:29 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
free_extent_state(*cached_state);
|
|
|
|
*cached_state = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
state = find_first_extent_bit_state(tree, start, bits);
|
|
|
|
got_it:
|
|
|
|
if (state) {
|
|
|
|
cache_state_if_flags(state, cached_state, 0);
|
|
|
|
*start_ret = state->start;
|
|
|
|
*end_ret = state->end;
|
2023-06-30 15:03:49 +00:00
|
|
|
ret = true;
|
2022-09-09 21:53:29 +00:00
|
|
|
}
|
|
|
|
out:
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find a contiguous area of bits
|
|
|
|
*
|
|
|
|
* @tree: io tree to check
|
|
|
|
* @start: offset to start the search from
|
|
|
|
* @start_ret: the first offset we found with the bits set
|
|
|
|
* @end_ret: the final contiguous range of the bits that were set
|
|
|
|
* @bits: bits to look for
|
|
|
|
*
|
|
|
|
* set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
|
|
|
|
* to set bits appropriately, and then merge them again. During this time it
|
|
|
|
* will drop the tree->lock, so use this helper if you want to find the actual
|
|
|
|
* contiguous area for given bits. We will search to the first bit we find, and
|
|
|
|
* then walk down the tree until we find a non-contiguous area. The area
|
|
|
|
* returned will be the full contiguous area with the bits set.
|
|
|
|
*/
|
|
|
|
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 *start_ret, u64 *end_ret, u32 bits)
|
|
|
|
{
|
|
|
|
struct extent_state *state;
|
|
|
|
int ret = 1;
|
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
state = find_first_extent_bit_state(tree, start, bits);
|
|
|
|
if (state) {
|
|
|
|
*start_ret = state->start;
|
|
|
|
*end_ret = state->end;
|
|
|
|
while ((state = next_state(state)) != NULL) {
|
|
|
|
if (state->start > (*end_ret + 1))
|
|
|
|
break;
|
|
|
|
*end_ret = state->end;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find a contiguous range of bytes in the file marked as delalloc, not more
|
|
|
|
* than 'max_bytes'. start and end are used to return the range,
|
|
|
|
*
|
|
|
|
* True is returned if we find something, false if nothing was in the tree.
|
|
|
|
*/
|
|
|
|
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
|
|
|
|
u64 *end, u64 max_bytes,
|
|
|
|
struct extent_state **cached_state)
|
|
|
|
{
|
|
|
|
struct extent_state *state;
|
|
|
|
u64 cur_start = *start;
|
|
|
|
bool found = false;
|
|
|
|
u64 total_bytes = 0;
|
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This search will find all the extents that end after our range
|
|
|
|
* starts.
|
|
|
|
*/
|
2022-09-09 21:53:34 +00:00
|
|
|
state = tree_search(tree, cur_start);
|
|
|
|
if (!state) {
|
2022-09-09 21:53:29 +00:00
|
|
|
*end = (u64)-1;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:33 +00:00
|
|
|
while (state) {
|
2022-09-09 21:53:29 +00:00
|
|
|
if (found && (state->start != cur_start ||
|
|
|
|
(state->state & EXTENT_BOUNDARY))) {
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (!(state->state & EXTENT_DELALLOC)) {
|
|
|
|
if (!found)
|
|
|
|
*end = state->end;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (!found) {
|
|
|
|
*start = state->start;
|
|
|
|
*cached_state = state;
|
|
|
|
refcount_inc(&state->refs);
|
|
|
|
}
|
|
|
|
found = true;
|
|
|
|
*end = state->end;
|
|
|
|
cur_start = state->end + 1;
|
|
|
|
total_bytes += state->end - state->start + 1;
|
|
|
|
if (total_bytes >= max_bytes)
|
|
|
|
break;
|
2022-09-09 21:53:33 +00:00
|
|
|
state = next_state(state);
|
2022-09-09 21:53:29 +00:00
|
|
|
}
|
|
|
|
out:
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set some bits on a range in the tree. This may require allocations or
|
2023-05-24 23:04:39 +00:00
|
|
|
* sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for
|
|
|
|
* GFP_NOWAIT.
|
2022-09-09 21:53:29 +00:00
|
|
|
*
|
|
|
|
* If any of the exclusive bits are set, this will fail with -EEXIST if some
|
2022-09-30 20:45:12 +00:00
|
|
|
* part of the range already has the desired bits set. The extent_state of the
|
|
|
|
* existing range is returned in failed_state in this case, and the start of the
|
|
|
|
* existing range is returned in failed_start. failed_state is used as an
|
|
|
|
* optimization for wait_extent_bit, failed_start must be used as the source of
|
|
|
|
* truth as failed_state may have changed since we returned.
|
2022-09-09 21:53:29 +00:00
|
|
|
*
|
|
|
|
* [start, end] is inclusive. This takes the tree lock.
|
|
|
|
*/
|
2022-09-09 21:53:41 +00:00
|
|
|
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
u32 bits, u64 *failed_start,
|
2022-09-30 20:45:12 +00:00
|
|
|
struct extent_state **failed_state,
|
2022-09-09 21:53:41 +00:00
|
|
|
struct extent_state **cached_state,
|
2023-05-24 23:04:39 +00:00
|
|
|
struct extent_changeset *changeset)
|
2022-09-09 21:53:29 +00:00
|
|
|
{
|
|
|
|
struct extent_state *state;
|
|
|
|
struct extent_state *prealloc = NULL;
|
2022-12-16 20:15:55 +00:00
|
|
|
struct rb_node **p = NULL;
|
|
|
|
struct rb_node *parent = NULL;
|
2022-09-09 21:53:29 +00:00
|
|
|
int err = 0;
|
|
|
|
u64 last_start;
|
|
|
|
u64 last_end;
|
2022-09-09 21:53:39 +00:00
|
|
|
u32 exclusive_bits = (bits & EXTENT_LOCKED);
|
2023-05-24 23:04:39 +00:00
|
|
|
gfp_t mask;
|
2022-09-09 21:53:29 +00:00
|
|
|
|
2023-05-24 23:04:37 +00:00
|
|
|
set_gfp_mask_from_bits(&bits, &mask);
|
2022-09-09 21:53:29 +00:00
|
|
|
btrfs_debug_check_extent_io_range(tree, start, end);
|
|
|
|
trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
|
|
|
|
|
|
|
|
if (exclusive_bits)
|
|
|
|
ASSERT(failed_start);
|
|
|
|
else
|
2022-09-30 20:45:12 +00:00
|
|
|
ASSERT(failed_start == NULL && failed_state == NULL);
|
2022-09-09 21:53:29 +00:00
|
|
|
again:
|
2022-10-14 14:00:41 +00:00
|
|
|
if (!prealloc) {
|
2022-09-09 21:53:29 +00:00
|
|
|
/*
|
|
|
|
* Don't care for allocation failure here because we might end
|
|
|
|
* up not needing the pre-allocated extent state at all, which
|
|
|
|
* is the case if we only have in the tree extent states that
|
|
|
|
* cover our input range and don't cover too any other range.
|
|
|
|
* If we end up needing a new extent state we allocate it later.
|
|
|
|
*/
|
|
|
|
prealloc = alloc_extent_state(mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
if (cached_state && *cached_state) {
|
|
|
|
state = *cached_state;
|
|
|
|
if (state->start <= start && state->end > start &&
|
2022-09-09 21:53:35 +00:00
|
|
|
extent_state_in_tree(state))
|
2022-09-09 21:53:29 +00:00
|
|
|
goto hit_next;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* This search will find all the extents that end after our range
|
|
|
|
* starts.
|
|
|
|
*/
|
2022-09-09 21:53:35 +00:00
|
|
|
state = tree_search_for_insert(tree, start, &p, &parent);
|
|
|
|
if (!state) {
|
2022-09-09 21:53:29 +00:00
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
2022-10-14 14:00:41 +00:00
|
|
|
if (!prealloc)
|
|
|
|
goto search_again;
|
2022-09-09 21:53:29 +00:00
|
|
|
prealloc->start = start;
|
|
|
|
prealloc->end = end;
|
|
|
|
insert_state_fast(tree, prealloc, p, parent, bits, changeset);
|
|
|
|
cache_state(prealloc, cached_state);
|
|
|
|
prealloc = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
hit_next:
|
|
|
|
last_start = state->start;
|
|
|
|
last_end = state->end;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state |
|
|
|
|
*
|
|
|
|
* Just lock what we found and keep going
|
|
|
|
*/
|
|
|
|
if (state->start == start && state->end <= end) {
|
|
|
|
if (state->state & exclusive_bits) {
|
|
|
|
*failed_start = state->start;
|
2022-09-30 20:45:12 +00:00
|
|
|
cache_state(state, failed_state);
|
2022-09-09 21:53:29 +00:00
|
|
|
err = -EEXIST;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
set_state_bits(tree, state, bits, changeset);
|
|
|
|
cache_state(state, cached_state);
|
|
|
|
merge_state(tree, state);
|
|
|
|
if (last_end == (u64)-1)
|
|
|
|
goto out;
|
|
|
|
start = last_end + 1;
|
|
|
|
state = next_state(state);
|
|
|
|
if (start < end && state && state->start == start &&
|
|
|
|
!need_resched())
|
|
|
|
goto hit_next;
|
|
|
|
goto search_again;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state |
|
|
|
|
* or
|
|
|
|
* | ------------- state -------------- |
|
|
|
|
*
|
|
|
|
* We need to split the extent we found, and may flip bits on second
|
|
|
|
* half.
|
|
|
|
*
|
|
|
|
* If the extent we found extends past our range, we just split and
|
|
|
|
* search again. It'll get split again the next time though.
|
|
|
|
*
|
|
|
|
* If the extent we found is inside our range, we set the desired bit
|
|
|
|
* on it.
|
|
|
|
*/
|
|
|
|
if (state->start < start) {
|
|
|
|
if (state->state & exclusive_bits) {
|
|
|
|
*failed_start = start;
|
2022-09-30 20:45:12 +00:00
|
|
|
cache_state(state, failed_state);
|
2022-09-09 21:53:29 +00:00
|
|
|
err = -EEXIST;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this extent already has all the bits we want set, then
|
|
|
|
* skip it, not necessary to split it or do anything with it.
|
|
|
|
*/
|
|
|
|
if ((state->state & bits) == bits) {
|
|
|
|
start = state->end + 1;
|
|
|
|
cache_state(state, cached_state);
|
|
|
|
goto search_again;
|
|
|
|
}
|
|
|
|
|
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
2022-10-14 14:00:41 +00:00
|
|
|
if (!prealloc)
|
|
|
|
goto search_again;
|
2022-09-09 21:53:29 +00:00
|
|
|
err = split_state(tree, state, prealloc, start);
|
|
|
|
if (err)
|
|
|
|
extent_io_tree_panic(tree, err);
|
|
|
|
|
|
|
|
prealloc = NULL;
|
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
if (state->end <= end) {
|
|
|
|
set_state_bits(tree, state, bits, changeset);
|
|
|
|
cache_state(state, cached_state);
|
|
|
|
merge_state(tree, state);
|
|
|
|
if (last_end == (u64)-1)
|
|
|
|
goto out;
|
|
|
|
start = last_end + 1;
|
|
|
|
state = next_state(state);
|
|
|
|
if (start < end && state && state->start == start &&
|
|
|
|
!need_resched())
|
|
|
|
goto hit_next;
|
|
|
|
}
|
|
|
|
goto search_again;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state | or | state |
|
|
|
|
*
|
|
|
|
* There's a hole, we need to insert something in it and ignore the
|
|
|
|
* extent we found.
|
|
|
|
*/
|
|
|
|
if (state->start > start) {
|
|
|
|
u64 this_end;
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
struct extent_state *inserted_state;
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
if (end < last_start)
|
|
|
|
this_end = end;
|
|
|
|
else
|
|
|
|
this_end = last_start - 1;
|
|
|
|
|
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
2022-10-14 14:00:41 +00:00
|
|
|
if (!prealloc)
|
|
|
|
goto search_again;
|
2022-09-09 21:53:29 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Avoid freeing 'prealloc' if it can be merged with the later
|
|
|
|
* extent.
|
|
|
|
*/
|
|
|
|
prealloc->start = start;
|
|
|
|
prealloc->end = this_end;
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
inserted_state = insert_state(tree, prealloc, bits, changeset);
|
|
|
|
if (IS_ERR(inserted_state)) {
|
|
|
|
err = PTR_ERR(inserted_state);
|
2022-09-09 21:53:29 +00:00
|
|
|
extent_io_tree_panic(tree, err);
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
}
|
2022-09-09 21:53:29 +00:00
|
|
|
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
cache_state(inserted_state, cached_state);
|
|
|
|
if (inserted_state == prealloc)
|
|
|
|
prealloc = NULL;
|
2022-09-09 21:53:29 +00:00
|
|
|
start = this_end + 1;
|
|
|
|
goto search_again;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state |
|
|
|
|
*
|
|
|
|
* We need to split the extent, and set the bit on the first half
|
|
|
|
*/
|
|
|
|
if (state->start <= end && state->end > end) {
|
|
|
|
if (state->state & exclusive_bits) {
|
|
|
|
*failed_start = start;
|
2022-09-30 20:45:12 +00:00
|
|
|
cache_state(state, failed_state);
|
2022-09-09 21:53:29 +00:00
|
|
|
err = -EEXIST;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
2022-10-14 14:00:41 +00:00
|
|
|
if (!prealloc)
|
|
|
|
goto search_again;
|
2022-09-09 21:53:29 +00:00
|
|
|
err = split_state(tree, state, prealloc, end + 1);
|
|
|
|
if (err)
|
|
|
|
extent_io_tree_panic(tree, err);
|
|
|
|
|
|
|
|
set_state_bits(tree, prealloc, bits, changeset);
|
|
|
|
cache_state(prealloc, cached_state);
|
|
|
|
merge_state(tree, prealloc);
|
|
|
|
prealloc = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
search_again:
|
|
|
|
if (start > end)
|
|
|
|
goto out;
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
if (gfpflags_allow_blocking(mask))
|
|
|
|
cond_resched();
|
|
|
|
goto again;
|
|
|
|
|
|
|
|
out:
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
if (prealloc)
|
|
|
|
free_extent_state(prealloc);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:41 +00:00
|
|
|
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
2023-05-24 23:04:39 +00:00
|
|
|
u32 bits, struct extent_state **cached_state)
|
2022-09-09 21:53:41 +00:00
|
|
|
{
|
2022-09-30 20:45:12 +00:00
|
|
|
return __set_extent_bit(tree, start, end, bits, NULL, NULL,
|
2023-05-24 23:04:39 +00:00
|
|
|
cached_state, NULL);
|
2022-09-09 21:53:41 +00:00
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
/*
|
|
|
|
* Convert all bits in a given range from one bit to another
|
|
|
|
*
|
|
|
|
* @tree: the io tree to search
|
|
|
|
* @start: the start offset in bytes
|
|
|
|
* @end: the end offset in bytes (inclusive)
|
|
|
|
* @bits: the bits to set in this range
|
|
|
|
* @clear_bits: the bits to clear in this range
|
|
|
|
* @cached_state: state that we're going to cache
|
|
|
|
*
|
|
|
|
* This will go through and set bits for the given range. If any states exist
|
|
|
|
* already in this range they are set with the given bit and cleared of the
|
|
|
|
* clear_bits. This is only meant to be used by things that are mergeable, ie.
|
|
|
|
* converting from say DELALLOC to DIRTY. This is not meant to be used with
|
|
|
|
* boundary bits like LOCK.
|
|
|
|
*
|
|
|
|
* All allocations are done with GFP_NOFS.
|
|
|
|
*/
|
|
|
|
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
u32 bits, u32 clear_bits,
|
|
|
|
struct extent_state **cached_state)
|
|
|
|
{
|
|
|
|
struct extent_state *state;
|
|
|
|
struct extent_state *prealloc = NULL;
|
2022-12-16 20:15:55 +00:00
|
|
|
struct rb_node **p = NULL;
|
|
|
|
struct rb_node *parent = NULL;
|
2022-09-09 21:53:29 +00:00
|
|
|
int err = 0;
|
|
|
|
u64 last_start;
|
|
|
|
u64 last_end;
|
|
|
|
bool first_iteration = true;
|
|
|
|
|
|
|
|
btrfs_debug_check_extent_io_range(tree, start, end);
|
|
|
|
trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
|
|
|
|
clear_bits);
|
|
|
|
|
|
|
|
again:
|
|
|
|
if (!prealloc) {
|
|
|
|
/*
|
|
|
|
* Best effort, don't worry if extent state allocation fails
|
|
|
|
* here for the first iteration. We might have a cached state
|
|
|
|
* that matches exactly the target range, in which case no
|
|
|
|
* extent state allocations are needed. We'll only know this
|
|
|
|
* after locking the tree.
|
|
|
|
*/
|
|
|
|
prealloc = alloc_extent_state(GFP_NOFS);
|
|
|
|
if (!prealloc && !first_iteration)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
if (cached_state && *cached_state) {
|
|
|
|
state = *cached_state;
|
|
|
|
if (state->start <= start && state->end > start &&
|
2022-09-09 21:53:35 +00:00
|
|
|
extent_state_in_tree(state))
|
2022-09-09 21:53:29 +00:00
|
|
|
goto hit_next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This search will find all the extents that end after our range
|
|
|
|
* starts.
|
|
|
|
*/
|
2022-09-09 21:53:35 +00:00
|
|
|
state = tree_search_for_insert(tree, start, &p, &parent);
|
|
|
|
if (!state) {
|
2022-09-09 21:53:29 +00:00
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
|
|
|
if (!prealloc) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
prealloc->start = start;
|
|
|
|
prealloc->end = end;
|
|
|
|
insert_state_fast(tree, prealloc, p, parent, bits, NULL);
|
|
|
|
cache_state(prealloc, cached_state);
|
|
|
|
prealloc = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
hit_next:
|
|
|
|
last_start = state->start;
|
|
|
|
last_end = state->end;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state |
|
|
|
|
*
|
|
|
|
* Just set the new bits on what we found, clear the old ones and keep going.
|
|
|
|
*/
|
|
|
|
if (state->start == start && state->end <= end) {
|
|
|
|
set_state_bits(tree, state, bits, NULL);
|
|
|
|
cache_state(state, cached_state);
|
|
|
|
state = clear_state_bit(tree, state, clear_bits, 0, NULL);
|
|
|
|
if (last_end == (u64)-1)
|
|
|
|
goto out;
|
|
|
|
start = last_end + 1;
|
|
|
|
if (start < end && state && state->start == start &&
|
|
|
|
!need_resched())
|
|
|
|
goto hit_next;
|
|
|
|
goto search_again;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state |
|
|
|
|
* or
|
|
|
|
* | ------------- state -------------- |
|
|
|
|
*
|
|
|
|
* We need to split the extent we found, and may flip bits on second
|
|
|
|
* half.
|
|
|
|
*
|
|
|
|
* If the extent we found extends past our range, we just split and
|
|
|
|
* search again. It'll get split again the next time though.
|
|
|
|
*
|
|
|
|
* If the extent we found is inside our range, we set the desired bit
|
|
|
|
* on it.
|
|
|
|
*/
|
|
|
|
if (state->start < start) {
|
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
|
|
|
if (!prealloc) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
err = split_state(tree, state, prealloc, start);
|
|
|
|
if (err)
|
|
|
|
extent_io_tree_panic(tree, err);
|
|
|
|
prealloc = NULL;
|
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
if (state->end <= end) {
|
|
|
|
set_state_bits(tree, state, bits, NULL);
|
|
|
|
cache_state(state, cached_state);
|
|
|
|
state = clear_state_bit(tree, state, clear_bits, 0, NULL);
|
|
|
|
if (last_end == (u64)-1)
|
|
|
|
goto out;
|
|
|
|
start = last_end + 1;
|
|
|
|
if (start < end && state && state->start == start &&
|
|
|
|
!need_resched())
|
|
|
|
goto hit_next;
|
|
|
|
}
|
|
|
|
goto search_again;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state | or | state |
|
|
|
|
*
|
|
|
|
* There's a hole, we need to insert something in it and ignore the
|
|
|
|
* extent we found.
|
|
|
|
*/
|
|
|
|
if (state->start > start) {
|
|
|
|
u64 this_end;
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
struct extent_state *inserted_state;
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
if (end < last_start)
|
|
|
|
this_end = end;
|
|
|
|
else
|
|
|
|
this_end = last_start - 1;
|
|
|
|
|
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
|
|
|
if (!prealloc) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Avoid freeing 'prealloc' if it can be merged with the later
|
|
|
|
* extent.
|
|
|
|
*/
|
|
|
|
prealloc->start = start;
|
|
|
|
prealloc->end = this_end;
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
inserted_state = insert_state(tree, prealloc, bits, NULL);
|
|
|
|
if (IS_ERR(inserted_state)) {
|
|
|
|
err = PTR_ERR(inserted_state);
|
2022-09-09 21:53:29 +00:00
|
|
|
extent_io_tree_panic(tree, err);
|
btrfs: make extent state merges more efficient during insertions
When inserting a new extent state record into an io tree that happens to
be mergeable, we currently do the following:
1) Insert the extent state record in the io tree's rbtree. This requires
going down the tree to find where to insert it, and during the
insertion we often need to balance the rbtree;
2) We then check if the previous node is mergeable, so we call rb_prev()
to find it, which requires some looping to find the previous node;
3) If the previous node is mergeable, we adjust our node to include the
range of the previous node and then delete the previous node from the
rbtree, which again may need to balance the rbtree;
4) Then we check if the next node is mergeable with the node we inserted,
so we call rb_next(), which requires some looping too. If the next node
is indeed mergeable, we expand the range of our node to include the
next node's range and then delete the next node from the rbtree, which
again may need to balance the tree.
So these are quite a lot of iterations and looping over the rbtree, and
some of the operations may need to rebalance the rb tree. This can be made
a bit more efficient by:
1) When iterating the rbtree, once we find a node that is mergeable with
the node we want to insert, we can just adjust that node's range with
the range of the node to insert - this avoids continuing iterating
over the tree and deleting a node from the rbtree;
2) If we expand the range of a mergeable node, then we find the next or
the previous node, depending on whether we merged a range to the right or
to the left of the node we are currently at during the iteration. This
merging is as before, we find the next or previous node with rb_next()
or rb_prev() and if that other node is mergeable with the current one,
we adjust the range of the current node and remove the other node from
the rbtree;
3) Whenever we need to insert the new extent state record it's because
we don't have any extent state record in the rbtree which can be
merged, so we can remove the call to merge_state() after the insertion,
saving rb_next() and rb_prev() calls, which require some looping.
So update the insertion function insert_state() to have this behaviour.
Running dbench for 120 seconds and capturing the execution times of
set_extent_bit() at pin_down_extent(), resulted in the following data
(time values are in nanoseconds):
Before this change:
Count: 2278299
Range: 0.000 - 4003728.000; Mean: 713.436; Median: 612.000; Stddev: 3606.952
Percentiles: 90th: 1187.000; 95th: 1350.000; 99th: 1724.000
0.000 - 7.534: 5 |
7.534 - 35.418: 36 |
35.418 - 154.403: 273 |
154.403 - 662.138: 1244016 #####################################################
662.138 - 2828.745: 1031335 ############################################
2828.745 - 12074.102: 1395 |
12074.102 - 51525.930: 806 |
51525.930 - 219874.955: 162 |
219874.955 - 938254.688: 22 |
938254.688 - 4003728.000: 3 |
After this change:
Count: 2275862
Range: 0.000 - 1605175.000; Mean: 678.903; Median: 590.000; Stddev: 2149.785
Percentiles: 90th: 1105.000; 95th: 1245.000; 99th: 1590.000
0.000 - 10.219: 10 |
10.219 - 40.957: 36 |
40.957 - 155.907: 262 |
155.907 - 585.789: 1127214 ####################################################
585.789 - 2193.431: 1145134 #####################################################
2193.431 - 8205.578: 1648 |
8205.578 - 30689.378: 1039 |
30689.378 - 114772.699: 362 |
114772.699 - 429221.537: 52 |
429221.537 - 1605175.000: 10 |
Maximum duration (range), average duration, percentiles and standard
deviation are all better.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-22 10:39:02 +00:00
|
|
|
}
|
|
|
|
cache_state(inserted_state, cached_state);
|
|
|
|
if (inserted_state == prealloc)
|
|
|
|
prealloc = NULL;
|
2022-09-09 21:53:29 +00:00
|
|
|
start = this_end + 1;
|
|
|
|
goto search_again;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* | ---- desired range ---- |
|
|
|
|
* | state |
|
|
|
|
*
|
|
|
|
* We need to split the extent, and set the bit on the first half.
|
|
|
|
*/
|
|
|
|
if (state->start <= end && state->end > end) {
|
|
|
|
prealloc = alloc_extent_state_atomic(prealloc);
|
|
|
|
if (!prealloc) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = split_state(tree, state, prealloc, end + 1);
|
|
|
|
if (err)
|
|
|
|
extent_io_tree_panic(tree, err);
|
|
|
|
|
|
|
|
set_state_bits(tree, prealloc, bits, NULL);
|
|
|
|
cache_state(prealloc, cached_state);
|
|
|
|
clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
|
|
|
|
prealloc = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
search_again:
|
|
|
|
if (start > end)
|
|
|
|
goto out;
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
cond_resched();
|
|
|
|
first_iteration = false;
|
|
|
|
goto again;
|
|
|
|
|
|
|
|
out:
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
if (prealloc)
|
|
|
|
free_extent_state(prealloc);
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:28 +00:00
|
|
|
/*
|
|
|
|
* Find the first range that has @bits not set. This range could start before
|
|
|
|
* @start.
|
|
|
|
*
|
|
|
|
* @tree: the tree to search
|
|
|
|
* @start: offset at/after which the found extent should start
|
|
|
|
* @start_ret: records the beginning of the range
|
|
|
|
* @end_ret: records the end of the range (inclusive)
|
|
|
|
* @bits: the set of bits which must be unset
|
|
|
|
*
|
|
|
|
* Since unallocated range is also considered one which doesn't have the bits
|
|
|
|
* set it's possible that @end_ret contains -1, this happens in case the range
|
|
|
|
* spans (last_range_end, end of device]. In this case it's up to the caller to
|
|
|
|
* trim @end_ret to the appropriate size.
|
|
|
|
*/
|
|
|
|
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 *start_ret, u64 *end_ret, u32 bits)
|
|
|
|
{
|
|
|
|
struct extent_state *state;
|
2022-11-18 20:09:42 +00:00
|
|
|
struct extent_state *prev = NULL, *next = NULL;
|
2022-09-09 21:53:28 +00:00
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
|
|
|
|
/* Find first extent with bits cleared */
|
|
|
|
while (1) {
|
2022-09-09 21:53:36 +00:00
|
|
|
state = tree_search_prev_next(tree, start, &prev, &next);
|
|
|
|
if (!state && !next && !prev) {
|
2022-09-09 21:53:28 +00:00
|
|
|
/*
|
|
|
|
* Tree is completely empty, send full range and let
|
|
|
|
* caller deal with it
|
|
|
|
*/
|
|
|
|
*start_ret = 0;
|
|
|
|
*end_ret = -1;
|
|
|
|
goto out;
|
2022-09-09 21:53:36 +00:00
|
|
|
} else if (!state && !next) {
|
2022-09-09 21:53:28 +00:00
|
|
|
/*
|
|
|
|
* We are past the last allocated chunk, set start at
|
|
|
|
* the end of the last extent.
|
|
|
|
*/
|
2022-09-09 21:53:36 +00:00
|
|
|
*start_ret = prev->end + 1;
|
2022-09-09 21:53:28 +00:00
|
|
|
*end_ret = -1;
|
|
|
|
goto out;
|
2022-09-09 21:53:36 +00:00
|
|
|
} else if (!state) {
|
|
|
|
state = next;
|
2022-09-09 21:53:28 +00:00
|
|
|
}
|
2022-09-09 21:53:36 +00:00
|
|
|
|
2022-09-09 21:53:28 +00:00
|
|
|
/*
|
2022-09-09 21:53:36 +00:00
|
|
|
* At this point 'state' either contains 'start' or start is
|
|
|
|
* before 'state'
|
2022-09-09 21:53:28 +00:00
|
|
|
*/
|
|
|
|
if (in_range(start, state->start, state->end - state->start + 1)) {
|
|
|
|
if (state->state & bits) {
|
|
|
|
/*
|
|
|
|
* |--range with bits sets--|
|
|
|
|
* |
|
|
|
|
* start
|
|
|
|
*/
|
|
|
|
start = state->end + 1;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* 'start' falls within a range that doesn't
|
|
|
|
* have the bits set, so take its start as the
|
|
|
|
* beginning of the desired range
|
|
|
|
*
|
|
|
|
* |--range with bits cleared----|
|
|
|
|
* |
|
|
|
|
* start
|
|
|
|
*/
|
|
|
|
*start_ret = state->start;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* |---prev range---|---hole/unset---|---node range---|
|
|
|
|
* |
|
|
|
|
* start
|
|
|
|
*
|
|
|
|
* or
|
|
|
|
*
|
|
|
|
* |---hole/unset--||--first node--|
|
|
|
|
* 0 |
|
|
|
|
* start
|
|
|
|
*/
|
2022-09-09 21:53:36 +00:00
|
|
|
if (prev)
|
|
|
|
*start_ret = prev->end + 1;
|
|
|
|
else
|
2022-09-09 21:53:28 +00:00
|
|
|
*start_ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the longest stretch from start until an entry which has the
|
|
|
|
* bits set
|
|
|
|
*/
|
2022-09-09 21:53:33 +00:00
|
|
|
while (state) {
|
2022-09-09 21:53:28 +00:00
|
|
|
if (state->end >= start && !(state->state & bits)) {
|
|
|
|
*end_ret = state->end;
|
|
|
|
} else {
|
|
|
|
*end_ret = state->start - 1;
|
|
|
|
break;
|
|
|
|
}
|
2022-09-09 21:53:33 +00:00
|
|
|
state = next_state(state);
|
2022-09-09 21:53:28 +00:00
|
|
|
}
|
|
|
|
out:
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
/*
|
2022-11-11 11:50:33 +00:00
|
|
|
* Count the number of bytes in the tree that have a given bit(s) set for a
|
|
|
|
* given range.
|
|
|
|
*
|
|
|
|
* @tree: The io tree to search.
|
|
|
|
* @start: The start offset of the range. This value is updated to the
|
|
|
|
* offset of the first byte found with the given bit(s), so it
|
|
|
|
* can end up being bigger than the initial value.
|
|
|
|
* @search_end: The end offset (inclusive value) of the search range.
|
|
|
|
* @max_bytes: The maximum byte count we are interested. The search stops
|
|
|
|
* once it reaches this count.
|
|
|
|
* @bits: The bits the range must have in order to be accounted for.
|
|
|
|
* If multiple bits are set, then only subranges that have all
|
|
|
|
* the bits set are accounted for.
|
|
|
|
* @contig: Indicate if we should ignore holes in the range or not. If
|
|
|
|
* this is true, then stop once we find a hole.
|
|
|
|
* @cached_state: A cached state to be used across multiple calls to this
|
|
|
|
* function in order to speedup searches. Use NULL if this is
|
|
|
|
* called only once or if each call does not start where the
|
|
|
|
* previous one ended.
|
|
|
|
*
|
|
|
|
* Returns the total number of bytes found within the given range that have
|
|
|
|
* all given bits set. If the returned number of bytes is greater than zero
|
|
|
|
* then @start is updated with the offset of the first byte with the bits set.
|
2022-09-09 21:53:29 +00:00
|
|
|
*/
|
|
|
|
u64 count_range_bits(struct extent_io_tree *tree,
|
|
|
|
u64 *start, u64 search_end, u64 max_bytes,
|
2022-11-11 11:50:32 +00:00
|
|
|
u32 bits, int contig,
|
|
|
|
struct extent_state **cached_state)
|
2022-09-09 21:53:29 +00:00
|
|
|
{
|
2022-11-11 11:50:32 +00:00
|
|
|
struct extent_state *state = NULL;
|
|
|
|
struct extent_state *cached;
|
2022-09-09 21:53:29 +00:00
|
|
|
u64 cur_start = *start;
|
|
|
|
u64 total_bytes = 0;
|
|
|
|
u64 last = 0;
|
|
|
|
int found = 0;
|
|
|
|
|
2022-12-23 18:28:53 +00:00
|
|
|
if (WARN_ON(search_end < cur_start))
|
2022-09-09 21:53:29 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
2022-09-09 21:53:45 +00:00
|
|
|
|
2022-11-11 11:50:32 +00:00
|
|
|
if (!cached_state || !*cached_state)
|
|
|
|
goto search;
|
|
|
|
|
|
|
|
cached = *cached_state;
|
|
|
|
|
|
|
|
if (!extent_state_in_tree(cached))
|
|
|
|
goto search;
|
|
|
|
|
|
|
|
if (cached->start <= cur_start && cur_start <= cached->end) {
|
|
|
|
state = cached;
|
|
|
|
} else if (cached->start > cur_start) {
|
|
|
|
struct extent_state *prev;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The cached state starts after our search range's start. Check
|
|
|
|
* if the previous state record starts at or before the range we
|
|
|
|
* are looking for, and if so, use it - this is a common case
|
|
|
|
* when there are holes between records in the tree. If there is
|
|
|
|
* no previous state record, we can start from our cached state.
|
|
|
|
*/
|
|
|
|
prev = prev_state(cached);
|
|
|
|
if (!prev)
|
|
|
|
state = cached;
|
|
|
|
else if (prev->start <= cur_start && cur_start <= prev->end)
|
|
|
|
state = prev;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
/*
|
|
|
|
* This search will find all the extents that end after our range
|
|
|
|
* starts.
|
|
|
|
*/
|
2022-11-11 11:50:32 +00:00
|
|
|
search:
|
|
|
|
if (!state)
|
|
|
|
state = tree_search(tree, cur_start);
|
|
|
|
|
2022-09-09 21:53:33 +00:00
|
|
|
while (state) {
|
2022-09-09 21:53:29 +00:00
|
|
|
if (state->start > search_end)
|
|
|
|
break;
|
|
|
|
if (contig && found && state->start > last + 1)
|
|
|
|
break;
|
|
|
|
if (state->end >= cur_start && (state->state & bits) == bits) {
|
|
|
|
total_bytes += min(search_end, state->end) + 1 -
|
|
|
|
max(cur_start, state->start);
|
|
|
|
if (total_bytes >= max_bytes)
|
|
|
|
break;
|
|
|
|
if (!found) {
|
|
|
|
*start = max(cur_start, state->start);
|
|
|
|
found = 1;
|
|
|
|
}
|
|
|
|
last = state->end;
|
|
|
|
} else if (contig && found) {
|
|
|
|
break;
|
|
|
|
}
|
2022-09-09 21:53:33 +00:00
|
|
|
state = next_state(state);
|
2022-09-09 21:53:29 +00:00
|
|
|
}
|
2022-11-11 11:50:32 +00:00
|
|
|
|
|
|
|
if (cached_state) {
|
|
|
|
free_extent_state(*cached_state);
|
|
|
|
*cached_state = state;
|
|
|
|
if (state)
|
|
|
|
refcount_inc(&state->refs);
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
spin_unlock(&tree->lock);
|
2022-11-11 11:50:32 +00:00
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
return total_bytes;
|
|
|
|
}
|
|
|
|
|
2023-09-11 23:09:23 +00:00
|
|
|
/*
|
|
|
|
* Check if the single @bit exists in the given range.
|
|
|
|
*/
|
|
|
|
bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
|
|
|
|
{
|
|
|
|
struct extent_state *state = NULL;
|
|
|
|
bool bitset = false;
|
|
|
|
|
|
|
|
ASSERT(is_power_of_2(bit));
|
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
state = tree_search(tree, start);
|
|
|
|
while (state && start <= end) {
|
|
|
|
if (state->start > end)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (state->state & bit) {
|
|
|
|
bitset = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If state->end is (u64)-1, start will overflow to 0 */
|
|
|
|
start = state->end + 1;
|
|
|
|
if (start > end || start == 0)
|
|
|
|
break;
|
|
|
|
state = next_state(state);
|
|
|
|
}
|
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
return bitset;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:29 +00:00
|
|
|
/*
|
2020-08-14 09:35:16 +00:00
|
|
|
* Check if the whole range [@start,@end) contains the single @bit set.
|
2022-09-09 21:53:29 +00:00
|
|
|
*/
|
2020-08-14 09:35:16 +00:00
|
|
|
bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
|
|
|
|
struct extent_state *cached)
|
2022-09-09 21:53:29 +00:00
|
|
|
{
|
|
|
|
struct extent_state *state = NULL;
|
2020-08-14 09:35:16 +00:00
|
|
|
bool bitset = true;
|
|
|
|
|
|
|
|
ASSERT(is_power_of_2(bit));
|
2022-09-09 21:53:29 +00:00
|
|
|
|
|
|
|
spin_lock(&tree->lock);
|
|
|
|
if (cached && extent_state_in_tree(cached) && cached->start <= start &&
|
|
|
|
cached->end > start)
|
2022-09-09 21:53:34 +00:00
|
|
|
state = cached;
|
2022-09-09 21:53:29 +00:00
|
|
|
else
|
2022-09-09 21:53:34 +00:00
|
|
|
state = tree_search(tree, start);
|
2022-09-09 21:53:33 +00:00
|
|
|
while (state && start <= end) {
|
2020-08-14 09:35:16 +00:00
|
|
|
if (state->start > start) {
|
|
|
|
bitset = false;
|
2022-09-09 21:53:29 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (state->start > end)
|
|
|
|
break;
|
|
|
|
|
2020-08-14 09:35:16 +00:00
|
|
|
if ((state->state & bit) == 0) {
|
|
|
|
bitset = false;
|
2022-09-09 21:53:29 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (state->end == (u64)-1)
|
|
|
|
break;
|
|
|
|
|
2020-08-14 09:35:16 +00:00
|
|
|
/*
|
|
|
|
* Last entry (if state->end is (u64)-1 and overflow happens),
|
|
|
|
* or next entry starts after the range.
|
|
|
|
*/
|
2022-09-09 21:53:29 +00:00
|
|
|
start = state->end + 1;
|
2020-08-14 09:35:16 +00:00
|
|
|
if (start > end || start == 0)
|
2022-09-09 21:53:29 +00:00
|
|
|
break;
|
2022-09-09 21:53:33 +00:00
|
|
|
state = next_state(state);
|
2022-09-09 21:53:29 +00:00
|
|
|
}
|
2022-09-09 21:53:33 +00:00
|
|
|
|
|
|
|
/* We ran out of states and were still inside of our range. */
|
2020-08-14 09:35:16 +00:00
|
|
|
if (!state)
|
|
|
|
bitset = false;
|
2022-09-09 21:53:29 +00:00
|
|
|
spin_unlock(&tree->lock);
|
|
|
|
return bitset;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:23 +00:00
|
|
|
/* Wrappers around set/clear extent bit */
|
|
|
|
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
u32 bits, struct extent_changeset *changeset)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We don't support EXTENT_LOCKED yet, as current changeset will
|
|
|
|
* record any bits changed, so for EXTENT_LOCKED case, it will
|
|
|
|
* either fail with -EEXIST or changeset will record the whole
|
|
|
|
* range.
|
|
|
|
*/
|
|
|
|
ASSERT(!(bits & EXTENT_LOCKED));
|
|
|
|
|
2023-05-24 23:04:39 +00:00
|
|
|
return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
|
2022-09-09 21:53:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
u32 bits, struct extent_changeset *changeset)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Don't support EXTENT_LOCKED case, same reason as
|
|
|
|
* set_record_extent_bits().
|
|
|
|
*/
|
|
|
|
ASSERT(!(bits & EXTENT_LOCKED));
|
|
|
|
|
2023-05-24 23:04:39 +00:00
|
|
|
return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
|
2022-09-09 21:53:23 +00:00
|
|
|
}
|
|
|
|
|
2022-09-30 20:45:09 +00:00
|
|
|
/*
 * Try to set EXTENT_LOCKED on the range without waiting.
 *
 * Returns 0 if __set_extent_bit() reported -EEXIST, in which case whatever
 * part of the range this call did lock, [@start, failed_start - 1], is
 * unlocked again before returning. Returns 1 otherwise.
 */
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    struct extent_state **cached)
{
	u64 failed_start;
	int ret;

	ret = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
			       NULL, cached, NULL);
	if (ret != -EEXIST)
		return 1;

	/* Undo the partial lock, if we got any of the range at all. */
	if (failed_start > start)
		clear_extent_bit(tree, start, failed_start - 1,
				 EXTENT_LOCKED, cached);
	return 0;
}
|
|
|
|
|
2022-09-09 21:53:28 +00:00
|
|
|
/*
|
|
|
|
* Either insert or lock state struct between start and end use mask to tell
|
|
|
|
* us if waiting is desired.
|
|
|
|
*/
|
2022-09-09 21:53:43 +00:00
|
|
|
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
struct extent_state **cached_state)
|
2022-09-09 21:53:28 +00:00
|
|
|
{
|
2022-09-30 20:45:12 +00:00
|
|
|
struct extent_state *failed_state = NULL;
|
2022-09-09 21:53:28 +00:00
|
|
|
int err;
|
|
|
|
u64 failed_start;
|
|
|
|
|
btrfs: unlock locked extent area if we have contention
In production we hit the following deadlock
task 1 task 2 task 3
------ ------ ------
fiemap(file) falloc(file) fsync(file)
write(0, 1MiB)
btrfs_commit_transaction()
wait_on(!pending_ordered)
lock(512MiB, 1GiB)
start_transaction
wait_on_transaction
lock(0, 1GiB)
wait_extent_bit(512MiB)
task 4
------
finish_ordered_extent(0, 1MiB)
lock(0, 1MiB)
**DEADLOCK**
This occurs because when task 1 does it's lock, it locks everything from
0-512MiB, and then waits for the 512MiB chunk to unlock. task 2 will
never unlock because it's waiting on the transaction commit to happen,
the transaction commit is waiting for the outstanding ordered extents,
and then the ordered extent thread is blocked waiting on the 0-1MiB
range to unlock.
To fix this we have to clear anything we've locked so far, wait for the
extent_state that we contended on, and then try to re-lock the entire
range again.
CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-09-30 20:45:08 +00:00
|
|
|
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
|
2023-05-24 23:04:39 +00:00
|
|
|
&failed_state, cached_state, NULL);
|
btrfs: unlock locked extent area if we have contention
In production we hit the following deadlock
task 1 task 2 task 3
------ ------ ------
fiemap(file) falloc(file) fsync(file)
write(0, 1MiB)
btrfs_commit_transaction()
wait_on(!pending_ordered)
lock(512MiB, 1GiB)
start_transaction
wait_on_transaction
lock(0, 1GiB)
wait_extent_bit(512MiB)
task 4
------
finish_ordered_extent(0, 1MiB)
lock(0, 1MiB)
**DEADLOCK**
This occurs because when task 1 does it's lock, it locks everything from
0-512MiB, and then waits for the 512MiB chunk to unlock. task 2 will
never unlock because it's waiting on the transaction commit to happen,
the transaction commit is waiting for the outstanding ordered extents,
and then the ordered extent thread is blocked waiting on the 0-1MiB
range to unlock.
To fix this we have to clear anything we've locked so far, wait for the
extent_state that we contended on, and then try to re-lock the entire
range again.
CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-09-30 20:45:08 +00:00
|
|
|
while (err == -EEXIST) {
|
|
|
|
if (failed_start != start)
|
|
|
|
clear_extent_bit(tree, start, failed_start - 1,
|
|
|
|
EXTENT_LOCKED, cached_state);
|
|
|
|
|
2022-09-30 20:45:12 +00:00
|
|
|
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
|
|
|
|
&failed_state);
|
2022-09-09 21:53:41 +00:00
|
|
|
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
|
2022-09-30 20:45:12 +00:00
|
|
|
&failed_start, &failed_state,
|
2023-05-24 23:04:39 +00:00
|
|
|
cached_state, NULL);
|
2022-09-09 21:53:28 +00:00
|
|
|
}
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2022-09-09 21:53:21 +00:00
|
|
|
/*
 * Destroy the extent_state slab cache.
 *
 * The leak check has to run before the cache is destroyed: with
 * CONFIG_BTRFS_DEBUG it reports every state still on the leak list and frees
 * it back into extent_state_cache.
 */
void __cold extent_state_free_cachep(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}
|
|
|
|
|
|
|
|
/*
 * Create the slab cache that extent_state structures are allocated from.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int __init extent_state_init_cachep(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
					       sizeof(struct extent_state), 0,
					       SLAB_MEM_SPREAD, NULL);

	return extent_state_cache ? 0 : -ENOMEM;
}
|