Btrfs: deal with convert_extent_bit errors to avoid fs corruption
When committing a transaction or a log, we look for btree extents that need to be durably persisted by searching for ranges in a io tree that have some bits set (EXTENT_DIRTY or EXTENT_NEW). We then attempt to clear those bits and set the EXTENT_NEED_WAIT bit, with calls to the function convert_extent_bit, and then start writeback for the extents. That function however can return an error (at the moment only -ENOMEM is possible, specially when it does GFP_ATOMIC allocation requests through alloc_extent_state_atomic) - that means the ranges didn't got the EXTENT_NEED_WAIT bit set (or at least not for the whole range), which in turn means a call to btrfs_wait_marked_extents() won't find those ranges for which we started writeback, causing a transaction commit or a log commit to persist a new superblock without waiting for the writeback of extents in that range to finish first. Therefore if a crash happens after persisting the new superblock and before writeback finishes, we have a superblock pointing to roots that weren't fully persisted or roots that point to nodes or leafs that weren't fully persisted, causing all sorts of unexpected/bad behaviour as we endup reading garbage from disk or the content of some node/leaf from a past generation that got cowed or deleted and is no longer valid (for this later case we end up getting error messages like "parent transid verify failed on X wanted Y found Z" when reading btree nodes/leafs from disk). Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
This commit is contained in:
		
							parent
							
								
									2fc9f6baa2
								
							
						
					
					
						commit
						663dfbb077
					
				| @ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void clear_btree_io_tree(struct extent_io_tree *tree) | ||||
| { | ||||
| 	spin_lock(&tree->lock); | ||||
| 	while (!RB_EMPTY_ROOT(&tree->state)) { | ||||
| 		struct rb_node *node; | ||||
| 		struct extent_state *state; | ||||
| 
 | ||||
| 		node = rb_first(&tree->state); | ||||
| 		state = rb_entry(node, struct extent_state, rb_node); | ||||
| 		rb_erase(&state->rb_node, &tree->state); | ||||
| 		RB_CLEAR_NODE(&state->rb_node); | ||||
| 		/*
 | ||||
| 		 * btree io trees aren't supposed to have tasks waiting for | ||||
| 		 * changes in the flags of extent states ever. | ||||
| 		 */ | ||||
| 		ASSERT(!waitqueue_active(&state->wq)); | ||||
| 		free_extent_state(state); | ||||
| 		if (need_resched()) { | ||||
| 			spin_unlock(&tree->lock); | ||||
| 			cond_resched(); | ||||
| 			spin_lock(&tree->lock); | ||||
| 		} | ||||
| 	} | ||||
| 	spin_unlock(&tree->lock); | ||||
| } | ||||
| 
 | ||||
| static noinline void switch_commit_roots(struct btrfs_transaction *trans, | ||||
| 					 struct btrfs_fs_info *fs_info) | ||||
| { | ||||
| @ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, | ||||
| 		root->commit_root = btrfs_root_node(root); | ||||
| 		if (is_fstree(root->objectid)) | ||||
| 			btrfs_unpin_free_ino(root); | ||||
| 		clear_btree_io_tree(&root->dirty_log_pages); | ||||
| 	} | ||||
| 	up_write(&fs_info->commit_root_sem); | ||||
| } | ||||
| @ -828,17 +855,38 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | ||||
| 
 | ||||
| 	while (!find_first_extent_bit(dirty_pages, start, &start, &end, | ||||
| 				      mark, &cached_state)) { | ||||
| 		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | ||||
| 				   mark, &cached_state, GFP_NOFS); | ||||
| 		cached_state = NULL; | ||||
| 		err = filemap_fdatawrite_range(mapping, start, end); | ||||
| 		bool wait_writeback = false; | ||||
| 
 | ||||
| 		err = convert_extent_bit(dirty_pages, start, end, | ||||
| 					 EXTENT_NEED_WAIT, | ||||
| 					 mark, &cached_state, GFP_NOFS); | ||||
| 		/*
 | ||||
| 		 * convert_extent_bit can return -ENOMEM, which is most of the | ||||
| 		 * time a temporary error. So when it happens, ignore the error | ||||
| 		 * and wait for writeback of this range to finish - because we | ||||
| 		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call | ||||
| 		 * to btrfs_wait_marked_extents() would not know that writeback | ||||
| 		 * for this range started and therefore wouldn't wait for it to | ||||
| 		 * finish - we don't want to commit a superblock that points to | ||||
| 		 * btree nodes/leafs for which writeback hasn't finished yet | ||||
| 		 * (and without errors). | ||||
| 		 * We cleanup any entries left in the io tree when committing | ||||
| 		 * the transaction (through clear_btree_io_tree()). | ||||
| 		 */ | ||||
| 		if (err == -ENOMEM) { | ||||
| 			err = 0; | ||||
| 			wait_writeback = true; | ||||
| 		} | ||||
| 		if (!err) | ||||
| 			err = filemap_fdatawrite_range(mapping, start, end); | ||||
| 		if (err) | ||||
| 			werr = err; | ||||
| 		else if (wait_writeback) | ||||
| 			werr = filemap_fdatawait_range(mapping, start, end); | ||||
| 		cached_state = NULL; | ||||
| 		cond_resched(); | ||||
| 		start = end + 1; | ||||
| 	} | ||||
| 	if (err) | ||||
| 		werr = err; | ||||
| 	return werr; | ||||
| } | ||||
| 
 | ||||
| @ -862,9 +910,21 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, | ||||
| 
 | ||||
| 	while (!find_first_extent_bit(dirty_pages, start, &start, &end, | ||||
| 				      EXTENT_NEED_WAIT, &cached_state)) { | ||||
| 		clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | ||||
| 				 0, 0, &cached_state, GFP_NOFS); | ||||
| 		err = filemap_fdatawait_range(mapping, start, end); | ||||
| 		/*
 | ||||
| 		 * Ignore -ENOMEM errors returned by clear_extent_bit(). | ||||
| 		 * When committing the transaction, we'll remove any entries | ||||
| 		 * left in the io tree. For a log commit, we don't remove them | ||||
| 		 * after committing the log because the tree can be accessed | ||||
| 		 * concurrently - we do it only at transaction commit time when | ||||
| 		 * it's safe to do it (through clear_btree_io_tree()). | ||||
| 		 */ | ||||
| 		err = clear_extent_bit(dirty_pages, start, end, | ||||
| 				       EXTENT_NEED_WAIT, | ||||
| 				       0, 0, &cached_state, GFP_NOFS); | ||||
| 		if (err == -ENOMEM) | ||||
| 			err = 0; | ||||
| 		if (!err) | ||||
| 			err = filemap_fdatawait_range(mapping, start, end); | ||||
| 		if (err) | ||||
| 			werr = err; | ||||
| 		cond_resched(); | ||||
| @ -919,17 +979,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | ||||
| static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | ||||
| 				     struct btrfs_root *root) | ||||
| { | ||||
| 	if (!trans || !trans->transaction) { | ||||
| 		struct inode *btree_inode; | ||||
| 		btree_inode = root->fs_info->btree_inode; | ||||
| 		return filemap_write_and_wait(btree_inode->i_mapping); | ||||
| 	} | ||||
| 	return btrfs_write_and_wait_marked_extents(root, | ||||
| 	int ret; | ||||
| 
 | ||||
| 	ret = btrfs_write_and_wait_marked_extents(root, | ||||
| 					   &trans->transaction->dirty_pages, | ||||
| 					   EXTENT_DIRTY); | ||||
| 	clear_btree_io_tree(&trans->transaction->dirty_pages); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | ||||
| @ -145,8 +145,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( | ||||
| 					struct btrfs_root *root); | ||||
| struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); | ||||
| int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); | ||||
| int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | ||||
| 				     struct btrfs_root *root); | ||||
| 
 | ||||
| void btrfs_add_dead_root(struct btrfs_root *root); | ||||
| int btrfs_defrag_root(struct btrfs_root *root); | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user