From 2a162ce93232eb78124601996744f8eafec845ab Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Mon, 6 Apr 2015 22:09:15 -0700 Subject: [PATCH 1/8] Btrfs: Improve FL_KEEP_SIZE handling in fallocate - We call inode_size_ok() only if FL_KEEP_SIZE isn't specified. - As an optimisation we can skip the call if (off + len) isn't greater than the current size of the file. This operation is called under the lock so the less work we do, the better. - If we call inode_size_ok() pass to it the correct value rather than a more conservative estimation. Signed-off-by: Davide Italiano Reviewed-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15a09cb156ce..7af1abda68d0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2682,9 +2682,12 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; inode_lock(inode); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out; + } /* * TODO: Move these two operations after we have checked From 264813acb1c756aebc337b16b832604a0c9aadaf Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 21 Mar 2016 14:59:53 -0700 Subject: [PATCH 2/8] Btrfs: fix invalid reference in replace_path Dan Carpenter's static checker has found this error, it's introduced by commit 64c043de466d ("Btrfs: fix up read_tree_block to return proper error") It's really supposed to 'break' the loop on error like others. Cc: Dan Carpenter Reported-by: Dan Carpenter Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2bd0011450df..5c806f0d443d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1850,6 +1850,7 @@ again: eb = read_tree_block(dest, old_bytenr, old_ptr_gen); if (IS_ERR(eb)) { ret = PTR_ERR(eb); + break; } else if (!extent_buffer_uptodate(eb)) { ret = -EIO; free_extent_buffer(eb); From 0305bc279317a6ba261a663cc32721d78e6544cf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Mar 2016 11:38:17 +0800 Subject: [PATCH 3/8] btrfs: Output more info for enospc_debug mount option As one user in mail list report reproducible balance ENOSPC error, it's better to add more debug info for enospc_debug mount option. Reported-by: Marc Haber Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53e12977bfd0..85074845c562 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9386,15 +9386,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) u64 dev_min = 1; u64 dev_nr = 0; u64 target; + int debug; int index; int full = 0; int ret = 0; + debug = btrfs_test_opt(root, ENOSPC_DEBUG); + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); /* odd, couldn't find the block group, leave it alone */ - if (!block_group) + if (!block_group) { + if (debug) + btrfs_warn(root->fs_info, + "can't find block group for bytenr %llu", + bytenr); return -1; + } min_free = btrfs_block_group_used(&block_group->item); @@ -9448,8 +9456,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * this is just a balance, so if we were marked as full * we know there is no space for a new chunk */ - if (full) + if (full) { + if (debug) + btrfs_warn(root->fs_info, + "no space to alloc new chunk for block group %llu", + block_group->key.objectid); goto out; + } index = get_block_group_index(block_group); } @@ -9496,6 +9509,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ret = -1; } } + if (debug && ret == -1) + btrfs_warn(root->fs_info, + "no space to allocate a new chunk for block group %llu", + block_group->key.objectid); mutex_unlock(&root->fs_info->chunk_mutex); btrfs_end_transaction(trans, root); out: From 918c2ee103cf9956f1c61d3f848dbb49fd2d104a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 30 Mar 2016 17:57:48 -0700 Subject: [PATCH 4/8] btrfs: handle non-fatal errors in btrfs_qgroup_inherit() create_pending_snapshot() will go readonly on _any_ error return from btrfs_qgroup_inherit(). If qgroups are enabled, a user can crash their fs by just making a snapshot and asking it to inherit from an invalid qgroup. For example: $ btrfs sub snap -i 1/10 /btrfs/ /btrfs/foo Will cause a transaction abort. Fix this by only throwing errors in btrfs_qgroup_inherit() when we know going readonly is acceptable. The following xfstests test case reproduces this bug: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" here=`pwd` tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { cd / rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # remove previous $seqres.full before test rm -f $seqres.full # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch rm -f $seqres.full _scratch_mkfs _scratch_mount _run_btrfs_util_prog quota enable $SCRATCH_MNT # The qgroup '1/10' does not exist and should be silently ignored _run_btrfs_util_prog subvolume snapshot -i 1/10 $SCRATCH_MNT $SCRATCH_MNT/snap1 _scratch_unmount echo "Silence is golden" status=0 exit Signed-off-by: Mark Fasheh Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 54 ++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5279fdae7142..7173360eea7a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1842,8 +1842,10 @@ out: } /* - * copy the acounting information between qgroups. This is necessary when a - * snapshot or a subvolume is created + * Copy the acounting information between qgroups. This is necessary + * when a snapshot or a subvolume is created. Throwing an error will + * cause a transaction abort so we take extra care here to only error + * when a readonly fs is a reasonable outcome. */ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, @@ -1873,15 +1875,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, 2 * inherit->num_excl_copies; for (i = 0; i < nums; ++i) { srcgroup = find_qgroup_rb(fs_info, *i_qgroups); - if (!srcgroup) { - ret = -EINVAL; - goto out; - } - if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { - ret = -EINVAL; - goto out; - } + /* + * Zero out invalid groups so we can ignore + * them later. + */ + if (!srcgroup || + ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) + *i_qgroups = 0ULL; + ++i_qgroups; } } @@ -1916,17 +1918,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, */ if (inherit) { i_qgroups = (u64 *)(inherit + 1); - for (i = 0; i < inherit->num_qgroups; ++i) { + for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { + if (*i_qgroups == 0) + continue; ret = add_qgroup_relation_item(trans, quota_root, objectid, *i_qgroups); - if (ret) + if (ret && ret != -EEXIST) goto out; ret = add_qgroup_relation_item(trans, quota_root, *i_qgroups, objectid); - if (ret) + if (ret && ret != -EEXIST) goto out; - ++i_qgroups; } + ret = 0; } @@ -1987,17 +1991,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { - ret = add_relation_rb(quota_root->fs_info, objectid, - *i_qgroups); - if (ret) - goto unlock; + if (*i_qgroups) { + ret = add_relation_rb(quota_root->fs_info, objectid, + *i_qgroups); + if (ret) + goto unlock; + } ++i_qgroups; } - for (i = 0; i < inherit->num_ref_copies; ++i) { + for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); @@ -2008,12 +2017,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dst->rfer = src->rfer - level_size; dst->rfer_cmpr = src->rfer_cmpr - level_size; - i_qgroups += 2; } - for (i = 0; i < inherit->num_excl_copies; ++i) { + for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); @@ -2024,7 +2035,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dst->excl = src->excl + level_size; dst->excl_cmpr = src->excl_cmpr + level_size; - i_qgroups += 2; } unlock: From 8f282f71eaee7ac979cdbe525f76daa0722798a8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 30 Mar 2016 16:01:12 +0200 Subject: [PATCH 5/8] btrfs: fallback to vmalloc in btrfs_compare_tree The allocation of node could fail if the memory is too fragmented for a given node size, practically observed with 64k. http://article.gmane.org/gmane.comp.file-systems.btrfs/54689 Reported-and-tested-by: Jean-Denis Girard Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 77592931ab4f..ec7928a27aaa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -5361,10 +5362,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL); + tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN); if (!tmp_buf) { - ret = -ENOMEM; - goto out; + tmp_buf = vmalloc(left_root->nodesize); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } } left_path->search_commit_root = 1; @@ -5565,7 +5569,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, out: btrfs_free_path(left_path); btrfs_free_path(right_path); - kfree(tmp_buf); + kvfree(tmp_buf); return ret; } From c79b4713304f812d3d6c95826fc3e5fc2c0b0c14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 25 Mar 2016 10:02:41 -0400 Subject: [PATCH 6/8] Btrfs: don't use src fd for printk The fd we pass in may not be on a btrfs file system, so don't try to do BTRFS_I() on it. Thanks, Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 053e677839fe..21423dd15da4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1654,7 +1654,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - btrfs_info(BTRFS_I(src_inode)->root->fs_info, + btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; } else if (!inode_owner_or_capable(src_inode)) { From 0f5dcf8de974db5970d48d12a202997baa2846a1 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 29 Mar 2016 17:19:55 -0700 Subject: [PATCH 7/8] btrfs: Add qgroup tracing This patch adds tracepoints to the qgroup code on both the reporting side (insert_dirty_extents) and the accounting side. Taken together it allows us to see what qgroup operations have happened, and what their result was. Signed-off-by: Mark Fasheh Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 9 ++++ include/trace/events/btrfs.h | 89 +++++++++++++++++++++++++++++++++++- 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7173360eea7a..9e119552ed32 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1463,6 +1463,7 @@ struct btrfs_qgroup_extent_record u64 bytenr = record->bytenr; assert_spin_locked(&delayed_refs->lock); + trace_btrfs_qgroup_insert_dirty_extent(record); while (*p) { parent_node = *p; @@ -1594,6 +1595,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + trace_qgroup_update_counters(qg->qgroupid, cur_old_count, + cur_new_count); + /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; @@ -1683,6 +1687,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, goto out_free; BUG_ON(!fs_info->quota_root); + trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots, + nr_new_roots); + qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { ret = -ENOMEM; @@ -1752,6 +1759,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record = rb_entry(node, struct btrfs_qgroup_extent_record, node); + trace_btrfs_qgroup_account_extents(record); + if (!ret) { /* * Use (u64)-1 as time_seq to do special search, which diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index d866f21efbbf..467a4d205ff6 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -23,7 +23,7 @@ struct map_lookup; struct extent_buffer; struct btrfs_work; struct __btrfs_workqueue; -struct btrfs_qgroup_operation; +struct btrfs_qgroup_extent_record; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1231,6 +1231,93 @@ DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref, TP_ARGS(ref_root, reserved) ); + +DECLARE_EVENT_CLASS(btrfs_qgroup_extent, + TP_PROTO(struct btrfs_qgroup_extent_record *rec), + + TP_ARGS(rec), + + TP_STRUCT__entry( + __field( u64, bytenr ) + __field( u64, num_bytes ) + ), + + TP_fast_assign( + __entry->bytenr = rec->bytenr, + __entry->num_bytes = rec->num_bytes; + ), + + TP_printk("bytenr = %llu, num_bytes = %llu", + (unsigned long long)__entry->bytenr, + (unsigned long long)__entry->num_bytes) +); + +DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, + + TP_PROTO(struct btrfs_qgroup_extent_record *rec), + + TP_ARGS(rec) +); + +DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent, + + TP_PROTO(struct btrfs_qgroup_extent_record *rec), + + TP_ARGS(rec) +); + +TRACE_EVENT(btrfs_qgroup_account_extent, + + TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots), + + TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots), + + TP_STRUCT__entry( + __field( u64, bytenr ) + __field( u64, num_bytes ) + __field( u64, nr_old_roots ) + __field( u64, nr_new_roots ) + ), + + TP_fast_assign( + __entry->bytenr = bytenr; + __entry->num_bytes = num_bytes; + __entry->nr_old_roots = nr_old_roots; + __entry->nr_new_roots = nr_new_roots; + ), + + TP_printk("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, " + "nr_new_roots = %llu", + __entry->bytenr, + __entry->num_bytes, + __entry->nr_old_roots, + __entry->nr_new_roots) +); + +TRACE_EVENT(qgroup_update_counters, + + TP_PROTO(u64 qgid, u64 cur_old_count, u64 cur_new_count), + + TP_ARGS(qgid, cur_old_count, cur_new_count), + + TP_STRUCT__entry( + __field( u64, qgid ) + __field( u64, cur_old_count ) + __field( u64, cur_new_count ) + ), + + TP_fast_assign( + __entry->qgid = qgid; + __entry->cur_old_count = cur_old_count; + __entry->cur_new_count = cur_new_count; + ), + + TP_printk("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu", + __entry->qgid, + __entry->cur_old_count, + __entry->cur_new_count) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ From 7ccefb98ce3e5c4493cd213cd03714b7149cf0cb Mon Sep 17 00:00:00 2001 From: Yauhen Kharuzhy Date: Tue, 29 Mar 2016 14:17:48 -0700 Subject: [PATCH 8/8] btrfs: Reset IO error counters before start of device replacing If device replace entry was found on disk at mounting and its num_write_errors stats counter has non-NULL value, then replace operation will never be finished and -EIO error will be reported by btrfs_scrub_dev() because this counter is never reset. # mount -o degraded /media/a4fb5c0a-21c5-4fe7-8d0e-fdd87d5f71ee/ # btrfs replace status /media/a4fb5c0a-21c5-4fe7-8d0e-fdd87d5f71ee/ Started on 25.Mar 07:28:00, canceled on 25.Mar 07:28:01 at 0.0%, 40 write errs, 0 uncorr. read errs # btrfs replace start -B 4 /dev/sdg /media/a4fb5c0a-21c5-4fe7-8d0e-fdd87d5f71ee/ ERROR: ioctl(DEV_REPLACE_START) failed on "/media/a4fb5c0a-21c5-4fe7-8d0e-fdd87d5f71ee/": Input/output error, no error Reset num_write_errors and num_uncorrectable_read_errors counters in the dev_replace structure before start of replacing. Signed-off-by: Yauhen Kharuzhy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a1d6652e0c47..26bcb487f958 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -394,6 +394,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->cursor_right = 0; dev_replace->is_valid = 1; dev_replace->item_needs_writeback = 1; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace, 1);