btrfs: scrub

This adds an initial implementation for scrub. It works in a quite
straightforward way. Userspace issues an ioctl for each device in the
fs. For each device, it enumerates the allocated device chunks. For
each chunk, the contained extents are enumerated and the data checksums
fetched. The extents are read sequentially and the checksums verified.
If an error occurs (checksum or EIO), a good copy is searched for. If
one is found, the bad copy will be rewritten.
All enumerations happen from the commit roots. During a transaction
commit, the scrubs get paused and afterwards continue from the new
roots.

This commit is based on the series originally posted to linux-btrfs
with some improvements that resulted from comments from David Sterba,
Ilya Dryomov and Jan Schmidt.

Signed-off-by: Arne Jansen <sensille@gmx.net>
This commit is contained in:
Arne Jansen 2011-03-08 14:14:00 +01:00
parent 7cf96da3ec
commit a2de733c78
12 changed files with 1600 additions and 11 deletions

View File

@ -7,4 +7,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o
compression.o delayed-ref.o relocation.o scrub.o

View File

@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
@ -33,6 +34,7 @@
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
#include "ioctl.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@ -510,6 +512,12 @@ struct btrfs_extent_item_v0 {
/* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
/*
* This flag is only used internally by scrub and may be changed at any
* time. It is only declared here to avoid collisions.
*/
#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
struct btrfs_tree_block_info {
struct btrfs_disk_key key;
u8 level;
@ -1077,6 +1085,17 @@ struct btrfs_fs_info {
void *bdev_holder;
/* private scrub information */
struct mutex scrub_lock;
atomic_t scrubs_running;
atomic_t scrub_pause_req;
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
struct rw_semaphore scrub_super_lock;
int scrub_workers_refcnt;
struct btrfs_workers scrub_workers;
/* filesystem state */
u64 fs_state;
};
@ -2472,8 +2491,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
u64 isize);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
u64 end, struct list_head *list);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
/* inode.c */
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
@ -2637,4 +2656,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
u64 *bytes_to_reserve);
void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
/* scrub.c */
/*
* Scrub entry points.
* NOTE(review): the implementations are not visible here; the notes below
* are derived from the call sites only and should be confirmed against
* scrub.c.
* - btrfs_scrub_pause() / btrfs_scrub_continue() are called around the
*   root commit in btrfs_commit_transaction().
* - btrfs_scrub_pause_super() / btrfs_scrub_continue_super() are called
*   around write_ctree_super() in btrfs_sync_log().
* - btrfs_scrub_cancel() is called from close_ctree().
* - btrfs_scrub_cancel_dev() is called from btrfs_rm_device().
*/
int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
struct btrfs_scrub_progress *progress);
int btrfs_scrub_pause(struct btrfs_root *root);
int btrfs_scrub_pause_super(struct btrfs_root *root);
int btrfs_scrub_continue(struct btrfs_root *root);
int btrfs_scrub_continue_super(struct btrfs_root *root);
int btrfs_scrub_cancel(struct btrfs_root *root);
int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
#endif

View File

@ -1773,6 +1773,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->ordered_extents);
spin_lock_init(&fs_info->ordered_extent_lock);
mutex_init(&fs_info->scrub_lock);
atomic_set(&fs_info->scrubs_running, 0);
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
init_waitqueue_head(&fs_info->scrub_pause_wait);
init_rwsem(&fs_info->scrub_super_lock);
fs_info->scrub_workers_refcnt = 0;
btrfs_init_workers(&fs_info->scrub_workers, "scrub",
fs_info->thread_pool_size, &fs_info->generic_worker);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
@ -2599,6 +2610,7 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
btrfs_scrub_cancel(root);
btrfs_put_block_group_cache(fs_info);
/*

View File

@ -266,7 +266,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list)
struct list_head *list, int search_commit)
{
struct btrfs_key key;
struct btrfs_path *path;
@ -283,6 +283,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
path = btrfs_alloc_path();
BUG_ON(!path);
if (search_commit) {
path->skip_locking = 1;
path->reada = 2;
path->search_commit_root = 1;
}
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY;

View File

@ -1007,7 +1007,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
LIST_HEAD(list);
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
bytenr + num_bytes - 1, &list);
bytenr + num_bytes - 1, &list, 0);
if (ret == 0 && list_empty(&list))
return 0;

View File

@ -42,6 +42,43 @@ struct btrfs_ioctl_vol_args_v2 {
char name[BTRFS_SUBVOL_NAME_MAX + 1];
};
/*
* Structure to report errors and progress to userspace, either as a
* result of a finished scrub, a canceled scrub or a progress inquiry.
*
* All members are __u64 counters, except last_physical, which is an address.
* NOTE(review): since this is reported to userspace, the member order and
* sizes are presumably part of the ioctl ABI -- confirm before changing.
*/
struct btrfs_scrub_progress {
__u64 data_extents_scrubbed; /* # of data extents scrubbed */
__u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
__u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
__u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
__u64 read_errors; /* # of read errors encountered (EIO) */
__u64 csum_errors; /* # of failed csum checks */
__u64 verify_errors; /* # of occurrences where the metadata
* of a tree block did not match the
* expected values, like generation or
* logical */
__u64 no_csum; /* # of 4k data blocks for which no csum
* is present, probably the result of
* data written with nodatasum */
__u64 csum_discards; /* # of csums for which no data was found
* in the extent tree. */
__u64 super_errors; /* # of bad super blocks encountered */
__u64 malloc_errors; /* # of internal kmalloc errors. These
* will likely cause an incomplete
* scrub */
__u64 uncorrectable_errors; /* # of errors where either no intact
* copy was found or the writeback
* failed */
__u64 corrected_errors; /* # of errors corrected */
__u64 last_physical; /* last physical address scrubbed. In
* case a scrub was aborted, this can
* be used to restart the scrub */
__u64 unverified_errors; /* # of occurrences where a read for a
* full (64k) bio failed, but the re-
* check succeeded for each 4k piece.
* Intermittent error. */
};
#define BTRFS_INO_LOOKUP_PATH_MAX 4080
struct btrfs_ioctl_ino_lookup_args {
__u64 treeid;

View File

@ -4242,7 +4242,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list);
disk_bytenr + len - 1, &list, 0);
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);

1492
fs/btrfs/scrub.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1321,6 +1321,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
WARN_ON(cur_trans != trans->transaction);
btrfs_scrub_pause(root);
/* btrfs_commit_tree_roots is responsible for getting the
* various roots consistent with each other. Every pointer
* in the tree of tree roots has to point to the most up to date
@ -1405,6 +1406,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
btrfs_scrub_continue(root);
if (current->journal_info == trans)
current->journal_info = NULL;

View File

@ -614,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1,
&ordered_sums);
&ordered_sums, 0);
BUG_ON(ret);
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
@ -2093,7 +2093,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop
* in and cause problems either.
*/
btrfs_scrub_pause_super(root);
write_ctree_super(trans, root->fs_info->tree_root, 1);
btrfs_scrub_continue_super(root);
ret = 0;
mutex_lock(&root->log_mutex);
@ -2689,7 +2691,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(
log->fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums);
&ordered_sums, 0);
BUG_ON(ret);
}
}

View File

@ -38,9 +38,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@ -1334,6 +1331,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
goto error_undo;
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root, device);
/*
* the device list mutex makes sure that we don't change

View File

@ -85,6 +85,9 @@ struct btrfs_device {
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
/* per-device scrub information */
struct scrub_dev *scrub_device;
struct btrfs_work work;
};
@ -157,6 +160,9 @@ struct map_lookup {
struct btrfs_bio_stripe stripes[];
};
/*
* Size in bytes of a struct map_lookup together with n trailing
* struct btrfs_bio_stripe entries (the stripes[] array at its end).
*/
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
/* Used to sort the devices by max_avail(descending sort) */
int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);