btrfs: improve btree readahead for full send operations
Currently a full send operation uses the standard btree readahead when iterating over the subvolume/snapshot btree, which despite bringing good performance benefits, it could be improved in a few aspects for use cases such as full send operations, which are guaranteed to visit every node and leaf of a btree, in ascending and sequential order. The limitations of that standard btree readahead implementation are the following: 1) It only triggers readahead for leaves that are physically close to the leaf being read, within a 64K range; 2) It only triggers readahead for the next or previous leaves if the leaf being read is not currently in memory; 3) It never triggers readahead for nodes. So add a new readahead mode that addresses all these points and use it for full send operations. The following test script was used to measure the improvement on a box using an average, consumer grade, spinning disk and with 16GiB of RAM: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj MKFS_OPTIONS="--nodesize 16384" # default, just to be explicit MOUNT_OPTIONS="-o max_inline=2048" # default, just to be explicit mkfs.btrfs -f $MKFS_OPTIONS $DEV > /dev/null mount $MOUNT_OPTIONS $DEV $MNT # Create files with inline data to make it easier and faster to create # large btrees. add_files() { local total=$1 local start_offset=$2 local number_jobs=$3 local total_per_job=$(($total / $number_jobs)) echo "Creating $total new files using $number_jobs jobs" for ((n = 0; n < $number_jobs; n++)); do ( local start_num=$(($start_offset + $n * $total_per_job)) for ((i = 1; i <= $total_per_job; i++)); do local file_num=$((start_num + $i)) local file_path="$MNT/file_${file_num}" xfs_io -f -c "pwrite -S 0xab 0 2000" $file_path > /dev/null if [ $? -ne 0 ]; then echo "Failed creating file $file_path" break fi done ) & worker_pids[$n]=$! done wait ${worker_pids[@]} sync echo echo "btree node/leaf count: $(btrfs inspect-internal dump-tree -t 5 $DEV | egrep '^(node|leaf) ' | wc -l)" } initial_file_count=500000 add_files $initial_file_count 0 4 echo echo "Creating first snapshot..." btrfs subvolume snapshot -r $MNT $MNT/snap1 echo echo "Adding more files..." add_files $((initial_file_count / 4)) $initial_file_count 4 echo echo "Updating 1/50th of the initial files..." for ((i = 1; i < $initial_file_count; i += 50)); do xfs_io -c "pwrite -S 0xcd 0 20" $MNT/file_$i > /dev/null done echo echo "Creating second snapshot..." btrfs subvolume snapshot -r $MNT $MNT/snap2 umount $MNT echo 3 > /proc/sys/vm/drop_caches blockdev --flushbufs $DEV &> /dev/null hdparm -F $DEV &> /dev/null mount $MOUNT_OPTIONS $DEV $MNT echo echo "Testing full send..." start=$(date +%s) btrfs send $MNT/snap1 > /dev/null end=$(date +%s) echo echo "Full send took $((end - start)) seconds" umount $MNT The durations of the full send operation in seconds were the following: Before this change: 217 seconds After this change: 205 seconds (-5.7%) Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
eafa4fd0ad
commit
ace75066ce
@ -1279,12 +1279,13 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
|
||||
u64 search;
|
||||
u64 target;
|
||||
u64 nread = 0;
|
||||
u64 nread_max;
|
||||
struct extent_buffer *eb;
|
||||
u32 nr;
|
||||
u32 blocksize;
|
||||
u32 nscan = 0;
|
||||
|
||||
if (level != 1)
|
||||
if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
|
||||
return;
|
||||
|
||||
if (!path->nodes[level])
|
||||
@ -1292,6 +1293,20 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
|
||||
|
||||
node = path->nodes[level];
|
||||
|
||||
/*
|
||||
* Since the time between visiting leaves is much shorter than the time
|
||||
* between visiting nodes, limit read ahead of nodes to 1, to avoid too
|
||||
* much IO at once (possibly random).
|
||||
*/
|
||||
if (path->reada == READA_FORWARD_ALWAYS) {
|
||||
if (level > 1)
|
||||
nread_max = node->fs_info->nodesize;
|
||||
else
|
||||
nread_max = SZ_128K;
|
||||
} else {
|
||||
nread_max = SZ_64K;
|
||||
}
|
||||
|
||||
search = btrfs_node_blockptr(node, slot);
|
||||
blocksize = fs_info->nodesize;
|
||||
eb = find_extent_buffer(fs_info, search);
|
||||
@ -1310,7 +1325,8 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
|
||||
if (nr == 0)
|
||||
break;
|
||||
nr--;
|
||||
} else if (path->reada == READA_FORWARD) {
|
||||
} else if (path->reada == READA_FORWARD ||
|
||||
path->reada == READA_FORWARD_ALWAYS) {
|
||||
nr++;
|
||||
if (nr >= nritems)
|
||||
break;
|
||||
@ -1321,13 +1337,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
|
||||
break;
|
||||
}
|
||||
search = btrfs_node_blockptr(node, nr);
|
||||
if ((search <= target && target - search <= 65536) ||
|
||||
if (path->reada == READA_FORWARD_ALWAYS ||
|
||||
(search <= target && target - search <= 65536) ||
|
||||
(search > target && search - target <= 65536)) {
|
||||
btrfs_readahead_node_child(node, nr);
|
||||
nread += blocksize;
|
||||
}
|
||||
nscan++;
|
||||
if ((nread > 65536 || nscan > 32))
|
||||
if (nread > nread_max || nscan > 32)
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1436,6 +1453,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
|
||||
|
||||
tmp = find_extent_buffer(fs_info, blocknr);
|
||||
if (tmp) {
|
||||
if (p->reada == READA_FORWARD_ALWAYS)
|
||||
reada_for_search(fs_info, p, level, slot, key->objectid);
|
||||
|
||||
/* first we do an atomic uptodate check */
|
||||
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
|
||||
/*
|
||||
|
@ -342,6 +342,27 @@ struct btrfs_node {
|
||||
struct btrfs_key_ptr ptrs[];
|
||||
} __attribute__ ((__packed__));
|
||||
|
||||
/* Read ahead values for struct btrfs_path.reada */
|
||||
enum {
|
||||
READA_NONE,
|
||||
READA_BACK,
|
||||
READA_FORWARD,
|
||||
/*
|
||||
* Similar to READA_FORWARD but unlike it:
|
||||
*
|
||||
* 1) It will trigger readahead even for leaves that are not close to
|
||||
* each other on disk;
|
||||
* 2) It also triggers readahead for nodes;
|
||||
* 3) During a search, even when a node or leaf is already in memory, it
|
||||
* will still trigger readahead for other nodes and leaves that follow
|
||||
* it.
|
||||
*
|
||||
* This is meant to be used only when we know we are iterating over the
|
||||
* entire tree or a very large part of it.
|
||||
*/
|
||||
READA_FORWARD_ALWAYS,
|
||||
};
|
||||
|
||||
/*
|
||||
* btrfs_paths remember the path taken from the root down to the leaf.
|
||||
* level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
|
||||
@ -350,7 +371,6 @@ struct btrfs_node {
|
||||
* The slots array records the index of the item or block pointer
|
||||
* used while walking the tree.
|
||||
*/
|
||||
enum { READA_NONE, READA_BACK, READA_FORWARD };
|
||||
struct btrfs_path {
|
||||
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
|
||||
int slots[BTRFS_MAX_LEVEL];
|
||||
|
@ -6650,7 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx)
|
||||
path = alloc_path_for_send();
|
||||
if (!path)
|
||||
return -ENOMEM;
|
||||
path->reada = READA_FORWARD;
|
||||
path->reada = READA_FORWARD_ALWAYS;
|
||||
|
||||
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
|
||||
key.type = BTRFS_INODE_ITEM_KEY;
|
||||
|
Loading…
Reference in New Issue
Block a user