linux/fs/btrfs/volumes.c

6754 lines
173 KiB
C
Raw Normal View History

/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/sched.h>
#include <linux/bio.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
return &fs_uuids;
}
static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
struct btrfs_fs_devices *fs_devs;
fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
mutex_init(&fs_devs->device_list_mutex);
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->resized_devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->list);
return fs_devs;
}
/**
* alloc_fs_devices - allocate struct btrfs_fs_devices
* @fsid: a pointer to UUID for this FS. If NULL a new UUID is
* generated.
*
* Return: a pointer to a new &struct btrfs_fs_devices on success;
* ERR_PTR() on error. Returned struct is not linked onto any lists and
* can be destroyed with kfree() right away.
*/
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
struct btrfs_fs_devices *fs_devs;
fs_devs = __alloc_fs_devices();
if (IS_ERR(fs_devs))
return fs_devs;
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
else
generate_random_uuid(fs_devs->fsid);
return fs_devs;
}
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
WARN_ON(fs_devices->opened);
while (!list_empty(&fs_devices->devices)) {
device = list_entry(fs_devices->devices.next,
struct btrfs_device, dev_list);
list_del(&device->dev_list);
rcu_string_free(device->name);
kfree(device);
}
kfree(fs_devices);
}
static void btrfs_kobject_uevent(struct block_device *bdev,
enum kobject_action action)
{
int ret;
ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
if (ret)
pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
action,
kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
&disk_to_dev(bdev->bd_disk)->kobj);
}
void btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
while (!list_empty(&fs_uuids)) {
fs_devices = list_entry(fs_uuids.next,
struct btrfs_fs_devices, list);
list_del(&fs_devices->list);
free_fs_devices(fs_devices);
}
}
static struct btrfs_device *__alloc_device(void)
{
struct btrfs_device *dev;
dev = kzalloc(sizeof(*dev), GFP_NOFS);
if (!dev)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&dev->dev_list);
INIT_LIST_HEAD(&dev->dev_alloc_list);
INIT_LIST_HEAD(&dev->resized_list);
spin_lock_init(&dev->io_lock);
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
return dev;
}
static noinline struct btrfs_device *__find_device(struct list_head *head,
u64 devid, u8 *uuid)
{
struct btrfs_device *dev;
list_for_each_entry(dev, head, dev_list) {
if (dev->devid == devid &&
(!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
return dev;
}
}
return NULL;
}
static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
list_for_each_entry(fs_devices, &fs_uuids, list) {
if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
return fs_devices;
}
return NULL;
}
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
int flush, struct block_device **bdev,
struct buffer_head **bh)
{
int ret;
*bdev = blkdev_get_by_path(device_path, flags, holder);
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
goto error;
}
if (flush)
filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
ret = set_blocksize(*bdev, 4096);
if (ret) {
blkdev_put(*bdev, flags);
goto error;
}
invalidate_bdev(*bdev);
*bh = btrfs_read_dev_super(*bdev);
if (!*bh) {
ret = -EINVAL;
blkdev_put(*bdev, flags);
goto error;
}
return 0;
error:
*bdev = NULL;
*bh = NULL;
return ret;
}
static void requeue_list(struct btrfs_pending_bios *pending_bios,
struct bio *head, struct bio *tail)
{
struct bio *old_head;
old_head = pending_bios->head;
pending_bios->head = head;
if (pending_bios->tail)
tail->bi_next = old_head;
else
pending_bios->tail = tail;
}
/*
* we try to collect pending bios for a device so we don't get a large
* number of procs sending bios down to the same device. This greatly
* improves the schedulers ability to collect and merge the bios.
*
* But, it also turns into a long list of bios to process and that is sure
* to eventually make the worker thread block. The solution here is to
* make some progress and then put this work struct back at the end of
* the list if the block device is congested. This way, multiple devices
* can make progress from a single worker thread.
*/
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
struct bio *pending;
struct backing_dev_info *bdi;
struct btrfs_fs_info *fs_info;
struct btrfs_pending_bios *pending_bios;
struct bio *tail;
struct bio *cur;
int again = 0;
unsigned long num_run;
unsigned long batch_run = 0;
unsigned long limit;
unsigned long last_waited = 0;
int force_reg = 0;
int sync_pending = 0;
struct blk_plug plug;
/*
* this function runs all the bios we've collected for
* a particular device. We don't want to wander off to
* another device without first sending all of these down.
* So, setup a plug here and finish it off before we return
*/
blk_start_plug(&plug);
bdi = blk_get_backing_dev_info(device->bdev);
fs_info = device->dev_root->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
loop:
spin_lock(&device->io_lock);
loop_lock:
num_run = 0;
/* take all the bios off the list at once and process them
* later on (without the lock held). But, remember the
* tail and other pointers so the bios can be properly reinserted
* into the list if we hit congestion
*/
if (!force_reg && device->pending_sync_bios.head) {
pending_bios = &device->pending_sync_bios;
force_reg = 1;
} else {
pending_bios = &device->pending_bios;
force_reg = 0;
}
pending = pending_bios->head;
tail = pending_bios->tail;
WARN_ON(pending && !tail);
/*
* if pending was null this time around, no bios need processing
* at all and we can stop. Otherwise it'll loop back up again
* and do an additional check so no bios are missed.
*
* device->running_pending is used to synchronize with the
* schedule_bio code.
*/
if (device->pending_sync_bios.head == NULL &&
device->pending_bios.head == NULL) {
again = 0;
device->running_pending = 0;
} else {
again = 1;
device->running_pending = 1;
}
pending_bios->head = NULL;
pending_bios->tail = NULL;
spin_unlock(&device->io_lock);
while (pending) {
rmb();
/* we want to work on both lists, but do more bios on the
* sync list than the regular list
*/
if ((num_run > 32 &&
pending_bios != &device->pending_sync_bios &&
device->pending_sync_bios.head) ||
(num_run > 64 && pending_bios == &device->pending_sync_bios &&
device->pending_bios.head)) {
spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
goto loop_lock;
}
cur = pending;
pending = pending->bi_next;
cur->bi_next = NULL;
if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
BUG_ON(atomic_read(&cur->bi_cnt) == 0);
/*
* if we're doing the sync list, record that our
* plug has some sync requests on it
*
* If we're doing the regular list and there are
* sync requests sitting around, unplug before
* we add more
*/
if (pending_bios == &device->pending_sync_bios) {
sync_pending = 1;
} else if (sync_pending) {
blk_finish_plug(&plug);
blk_start_plug(&plug);
sync_pending = 0;
}
btrfsic_submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
cond_resched();
/*
* we made progress, there is more work to do and the bdi
* is now congested. Back off and let other work structs
* run instead
*/
if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
fs_info->fs_devices->open_devices > 1) {
struct io_context *ioc;
ioc = current->io_context;
/*
* the main goal here is that we don't want to
* block if we're going to be able to submit
* more requests without blocking.
*
* This code does two great things, it pokes into
* the elevator code from a filesystem _and_
* it makes assumptions about how batching works.
*/
if (ioc && ioc->nr_batch_requests > 0 &&
time_before(jiffies, ioc->last_waited + HZ/50UL) &&
(last_waited == 0 ||
ioc->last_waited == last_waited)) {
/*
* we want to go through our batch of
* requests and stop. So, we copy out
* the ioc->last_waited time and test
* against it before looping
*/
last_waited = ioc->last_waited;
cond_resched();
continue;
}
spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
device->running_pending = 1;
spin_unlock(&device->io_lock);
btrfs_queue_work(fs_info->submit_workers,
&device->work);
goto done;
}
/* unplug every 64 requests just for good measure */
if (batch_run % 64 == 0) {
blk_finish_plug(&plug);
blk_start_plug(&plug);
sync_pending = 0;
}
}
cond_resched();
if (again)
goto loop;
spin_lock(&device->io_lock);
if (device->pending_bios.head || device->pending_sync_bios.head)
goto loop_lock;
spin_unlock(&device->io_lock);
done:
blk_finish_plug(&plug);
}
static void pending_bios_fn(struct btrfs_work *work)
{
struct btrfs_device *device;
device = container_of(work, struct btrfs_device, work);
run_scheduled_bios(device);
}
/*
* Add new device to list of registered devices
*
* Returns:
* 1 - first time device is seen
* 0 - device already known
* < 0 - error
*/
static noinline int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
struct rcu_string *name;
int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
fs_devices = find_fsid(disk_super->fsid);
if (!fs_devices) {
fs_devices = alloc_fs_devices(disk_super->fsid);
if (IS_ERR(fs_devices))
return PTR_ERR(fs_devices);
list_add(&fs_devices->list, &fs_uuids);
device = NULL;
} else {
device = __find_device(&fs_devices->devices, devid,
disk_super->dev_item.uuid);
}
if (!device) {
if (fs_devices->opened)
return -EBUSY;
device = btrfs_alloc_device(NULL, &devid,
disk_super->dev_item.uuid);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
return PTR_ERR(device);
}
name = rcu_string_strdup(path, GFP_NOFS);
if (!name) {
kfree(device);
return -ENOMEM;
}
rcu_assign_pointer(device->name, name);
mutex_lock(&fs_devices->device_list_mutex);
list_add_rcu(&device->dev_list, &fs_devices->devices);
Btrfs: fix race conditions in BTRFS_IOC_FS_INFO ioctl The handler for the ioctl BTRFS_IOC_FS_INFO was reading the number of devices before acquiring the device list mutex. This could lead to inconsistent results because the update of the device list and the number of devices counter (amongst other counters related to the device list) are updated in volumes.c while holding the device list mutex - except for 2 places, one was volumes.c:btrfs_prepare_sprout() and the other was volumes.c:device_list_add(). For example, if we have 2 devices, with IDs 1 and 2 and then add a new device, with ID 3, and while adding the device is in progress an BTRFS_IOC_FS_INFO ioctl arrives, it could return a number of devices of 2 and a max dev id of 3. This would be incorrect. Also, this ioctl handler was reading the fsid while it can be updated concurrently. This can happen when while a new device is being added and the current filesystem is in seeding mode. Example: $ mkfs.btrfs -f /dev/sdb1 $ mkfs.btrfs -f /dev/sdb2 $ btrfstune -S 1 /dev/sdb1 $ mount /dev/sdb1 /mnt/test $ btrfs device add /dev/sdb2 /mnt/test If during the last step a BTRFS_IOC_FS_INFO ioctl was requested, it could read an fsid that was never valid (some bits part of the old fsid and others part of the new fsid). Also, it could read a number of devices that doesn't match the number of devices in the list and the max device id, as explained before. Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-12 19:56:58 +00:00
fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
ret = 1;
device->fs_devices = fs_devices;
} else if (!device->name || strcmp(device->name->str, path)) {
Btrfs: device_list_add() should not update list when mounted device_list_add() is called when user runs btrfs dev scan, which would add any btrfs device into the btrfs_fs_devices list. Now think of a mounted btrfs. And a new device which contains the a SB from the mounted btrfs devices. In this situation when user runs btrfs dev scan, the current code would just replace existing device with the new device. Which is to note that old device is neither closed nor gracefully removed from the btrfs. The FS is still operational with the old bdev however the device name is the btrfs_device is new which is provided by the btrfs dev scan. reproducer: devmgt[1] detach /dev/sdc replace the missing disk /dev/sdc btrfs rep start -f 1 /dev/sde /btrfs Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120 Total devices 2 FS bytes used 32.00KiB devid 1 size 958.94MiB used 115.88MiB path /dev/sde devid 2 size 958.94MiB used 103.88MiB path /dev/sdd make /dev/sdc to reappear devmgt attach host2 btrfs dev scan btrfs fi show -m Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120^M Total devices 2 FS bytes used 32.00KiB^M devid 1 size 958.94MiB used 115.88MiB path /dev/sdc <- Wrong. devid 2 size 958.94MiB used 103.88MiB path /dev/sdd since /dev/sdc has been replaced with /dev/sde, the /dev/sdc shouldn't be part of the btrfs-fsid when it reappears. If user want it to be part of it then sys admin should be using btrfs device add instead. [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Reviewed-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-07-03 10:22:05 +00:00
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
* means this device was missing at time of FS mount.
* 2. If you are here and if the device->name is different
* from 'path' that means either
* a. The same device disappeared and reappeared with
* different name. or
* b. The missing-disk-which-was-replaced, has
* reappeared now.
*
* We must allow 1 and 2a above. But 2b would be a spurious
* and unintentional.
*
* Further in case of 1 and 2a above, the disk at 'path'
* would have missed some transaction when it was away and
* in case of 2a the stale bdev has to be updated as well.
* 2b must not be allowed at all time.
*/
/*
* For now, we do allow update to btrfs_fs_device through the
* btrfs dev scan cli after FS has been mounted. We're still
* tracking a problem where systems fail mount by subvolume id
* when we reject replacement on a mounted FS.
Btrfs: device_list_add() should not update list when mounted device_list_add() is called when user runs btrfs dev scan, which would add any btrfs device into the btrfs_fs_devices list. Now think of a mounted btrfs. And a new device which contains the a SB from the mounted btrfs devices. In this situation when user runs btrfs dev scan, the current code would just replace existing device with the new device. Which is to note that old device is neither closed nor gracefully removed from the btrfs. The FS is still operational with the old bdev however the device name is the btrfs_device is new which is provided by the btrfs dev scan. reproducer: devmgt[1] detach /dev/sdc replace the missing disk /dev/sdc btrfs rep start -f 1 /dev/sde /btrfs Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120 Total devices 2 FS bytes used 32.00KiB devid 1 size 958.94MiB used 115.88MiB path /dev/sde devid 2 size 958.94MiB used 103.88MiB path /dev/sdd make /dev/sdc to reappear devmgt attach host2 btrfs dev scan btrfs fi show -m Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120^M Total devices 2 FS bytes used 32.00KiB^M devid 1 size 958.94MiB used 115.88MiB path /dev/sdc <- Wrong. devid 2 size 958.94MiB used 103.88MiB path /dev/sdd since /dev/sdc has been replaced with /dev/sde, the /dev/sdc shouldn't be part of the btrfs-fsid when it reappears. If user want it to be part of it then sys admin should be using btrfs device add instead. [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Reviewed-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-07-03 10:22:05 +00:00
*/
if (!fs_devices->opened && found_transid < device->generation) {
/*
* That is if the FS is _not_ mounted and if you
* are here, that means there is more than one
* disk with same uuid and devid.We keep the one
* with larger generation number or the last-in if
* generation are equal.
*/
return -EEXIST;
}
Btrfs: device_list_add() should not update list when mounted device_list_add() is called when user runs btrfs dev scan, which would add any btrfs device into the btrfs_fs_devices list. Now think of a mounted btrfs. And a new device which contains the a SB from the mounted btrfs devices. In this situation when user runs btrfs dev scan, the current code would just replace existing device with the new device. Which is to note that old device is neither closed nor gracefully removed from the btrfs. The FS is still operational with the old bdev however the device name is the btrfs_device is new which is provided by the btrfs dev scan. reproducer: devmgt[1] detach /dev/sdc replace the missing disk /dev/sdc btrfs rep start -f 1 /dev/sde /btrfs Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120 Total devices 2 FS bytes used 32.00KiB devid 1 size 958.94MiB used 115.88MiB path /dev/sde devid 2 size 958.94MiB used 103.88MiB path /dev/sdd make /dev/sdc to reappear devmgt attach host2 btrfs dev scan btrfs fi show -m Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120^M Total devices 2 FS bytes used 32.00KiB^M devid 1 size 958.94MiB used 115.88MiB path /dev/sdc <- Wrong. devid 2 size 958.94MiB used 103.88MiB path /dev/sdd since /dev/sdc has been replaced with /dev/sde, the /dev/sdc shouldn't be part of the btrfs-fsid when it reappears. If user want it to be part of it then sys admin should be using btrfs device add instead. [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Reviewed-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-07-03 10:22:05 +00:00
name = rcu_string_strdup(path, GFP_NOFS);
if (!name)
return -ENOMEM;
rcu_string_free(device->name);
rcu_assign_pointer(device->name, name);
if (device->missing) {
fs_devices->missing_devices--;
device->missing = 0;
}
}
/*
* Unmount does not free the btrfs_device struct but would zero
* generation along with most of the other members. So just update
* it back. We need it to pick the disk with largest generation
* (as above).
*/
if (!fs_devices->opened)
device->generation = found_transid;
*fs_devices_ret = fs_devices;
return ret;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
struct btrfs_fs_devices *fs_devices;
struct btrfs_device *device;
struct btrfs_device *orig_dev;
fs_devices = alloc_fs_devices(orig->fsid);
if (IS_ERR(fs_devices))
return fs_devices;
mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
/* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
struct rcu_string *name;
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid);
if (IS_ERR(device))
goto error;
/*
* This is ok to do without rcu read locked because we hold the
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name) {
name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
if (!name) {
kfree(device);
goto error;
}
rcu_assign_pointer(device->name, name);
}
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(-ENOMEM);
}
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
struct btrfs_device *latest_dev = NULL;
mutex_lock(&uuid_mutex);
again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->in_fs_metadata) {
if (!device->is_tgtdev_for_dev_replace &&
(!latest_dev ||
device->generation > latest_dev->generation)) {
latest_dev = device;
}
continue;
}
if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
/*
* In the first step, keep the device which has
* the correct fsid and the devid that is used
* for the dev_replace procedure.
* In the second step, the dev_replace state is
* read from the device tree and it is known
* whether the procedure is really active or
* not, which means whether this device is
* used or whether it should be removed.
*/
if (step == 0 || device->is_tgtdev_for_dev_replace) {
continue;
}
}
if (device->bdev) {
blkdev_put(device->bdev, device->mode);
device->bdev = NULL;
fs_devices->open_devices--;
}
if (device->writeable) {
list_del_init(&device->dev_alloc_list);
device->writeable = 0;
if (!device->is_tgtdev_for_dev_replace)
fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
rcu_string_free(device->name);
kfree(device);
}
if (fs_devices->seed) {
fs_devices = fs_devices->seed;
goto again;
}
fs_devices->latest_bdev = latest_dev->bdev;
mutex_unlock(&uuid_mutex);
}
static void __free_device(struct work_struct *work)
{
struct btrfs_device *device;
device = container_of(work, struct btrfs_device, rcu_work);
if (device->bdev)
blkdev_put(device->bdev, device->mode);
rcu_string_free(device->name);
kfree(device);
}
static void free_device(struct rcu_head *head)
{
struct btrfs_device *device;
device = container_of(head, struct btrfs_device, rcu);
INIT_WORK(&device->rcu_work, __free_device);
schedule_work(&device->rcu_work);
}
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
if (--fs_devices->opened > 0)
return 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
struct btrfs_device *new_device;
struct rcu_string *name;
if (device->bdev)
fs_devices->open_devices--;
if (device->writeable &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
if (device->missing)
fs_devices->missing_devices--;
new_device = btrfs_alloc_device(NULL, &device->devid,
device->uuid);
BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
/* Safe because we are under uuid_mutex */
if (device->name) {
name = rcu_string_strdup(device->name->str, GFP_NOFS);
BUG_ON(!name); /* -ENOMEM */
rcu_assign_pointer(new_device->name, name);
}
list_replace_rcu(&device->dev_list, &new_device->dev_list);
new_device->fs_devices = device->fs_devices;
call_rcu(&device->rcu, free_device);
}
mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = 0;
return 0;
}
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_fs_devices *seed_devices = NULL;
int ret;
mutex_lock(&uuid_mutex);
ret = __btrfs_close_devices(fs_devices);
if (!fs_devices->opened) {
seed_devices = fs_devices->seed;
fs_devices->seed = NULL;
}
mutex_unlock(&uuid_mutex);
while (seed_devices) {
fs_devices = seed_devices;
seed_devices = fs_devices->seed;
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
}
/*
* Wait for rcu kworkers under __btrfs_close_devices
* to finish all blkdev_puts so device is really
* free when umount is done.
*/
rcu_barrier();
return ret;
}
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
struct request_queue *q;
struct block_device *bdev;
struct list_head *head = &fs_devices->devices;
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
u64 devid;
int seeding = 1;
int ret = 0;
flags |= FMODE_EXCL;
list_for_each_entry(device, head, dev_list) {
if (device->bdev)
continue;
if (!device->name)
continue;
/* Just open everything we can; ignore failures here */
if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
&bdev, &bh))
continue;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
if (devid != device->devid)
goto error_brelse;
if (memcmp(device->uuid, disk_super->dev_item.uuid,
BTRFS_UUID_SIZE))
goto error_brelse;
device->generation = btrfs_super_generation(disk_super);
if (!latest_dev ||
device->generation > latest_dev->generation)
latest_dev = device;
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
device->writeable = 0;
} else {
device->writeable = !bdev_read_only(bdev);
seeding = 0;
}
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
device->bdev = bdev;
device->in_fs_metadata = 0;
device->mode = flags;
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_devices->rotating = 1;
fs_devices->open_devices++;
if (device->writeable &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
fs_devices->rw_devices++;
list_add(&device->dev_alloc_list,
&fs_devices->alloc_list);
}
brelse(bh);
continue;
error_brelse:
brelse(bh);
blkdev_put(bdev, flags);
continue;
}
if (fs_devices->open_devices == 0) {
ret = -EINVAL;
goto out;
}
fs_devices->seeding = seeding;
fs_devices->opened = 1;
fs_devices->latest_bdev = latest_dev->bdev;
fs_devices->total_rw_bytes = 0;
out:
return ret;
}
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
int ret;
mutex_lock(&uuid_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
} else {
ret = __btrfs_open_devices(fs_devices, flags, holder);
}
mutex_unlock(&uuid_mutex);
return ret;
}
/*
* Look for a btrfs signature on a device. This may be called out of the mount path
* and we are not allowed to call set_blocksize during the scan. The superblock
* is read via pagecache
*/
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret)
{
struct btrfs_super_block *disk_super;
struct block_device *bdev;
struct page *page;
void *p;
int ret = -EINVAL;
u64 devid;
u64 transid;
u64 total_devices;
u64 bytenr;
pgoff_t index;
/*
* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
bytenr = btrfs_sb_offset(0);
flags |= FMODE_EXCL;
mutex_lock(&uuid_mutex);
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev)) {
ret = PTR_ERR(bdev);
goto error;
}
/* make sure our super fits in the device */
if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
goto error_bdev_put;
/* make sure our super fits in the page */
if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
goto error_bdev_put;
/* make sure our super doesn't straddle pages on disk */
index = bytenr >> PAGE_CACHE_SHIFT;
if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
goto error_bdev_put;
/* pull in the page with our super */
page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
index, GFP_NOFS);
if (IS_ERR_OR_NULL(page))
goto error_bdev_put;
p = kmap(page);
/* align our pointer to the offset of the super block */
disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
if (btrfs_super_bytenr(disk_super) != bytenr ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC)
goto error_unmap;
devid = btrfs_stack_device_id(&disk_super->dev_item);
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
if (ret > 0) {
if (disk_super->label[0]) {
if (disk_super->label[BTRFS_LABEL_SIZE - 1])
disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
} else {
printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
}
printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
ret = 0;
}
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
error_unmap:
kunmap(page);
page_cache_release(page);
error_bdev_put:
blkdev_put(bdev, flags);
error:
mutex_unlock(&uuid_mutex);
return ret;
}
btrfs: fix wrong free space information of btrfs When we store data by raid profile in btrfs with two or more different size disks, df command shows there is some free space in the filesystem, but the user can not write any data in fact, df command shows the wrong free space information of btrfs. # mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10 # btrfs-show Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64 Total devices 2 FS bytes used 28.00KB devid 1 size 5.01GB used 2.03GB path /dev/sda9 devid 2 size 10.00GB used 2.01GB path /dev/sda10 # btrfs device scan /dev/sda9 /dev/sda10 # mount /dev/sda9 /mnt # dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999 (fill the filesystem) # sync # df -TH Filesystem Type Size Used Avail Use% Mounted on /dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt # btrfs-show Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64 Total devices 2 FS bytes used 3.99GB devid 1 size 5.01GB used 5.01GB path /dev/sda9 devid 2 size 10.00GB used 4.99GB path /dev/sda10 It is because btrfs cannot allocate chunks when one of the pairing disks has no space, the free space on the other disks can not be used for ever, and should be subtracted from the total space, but btrfs doesn't subtract this space from the total. It is strange to the user. This patch fixes it by calcing the free space that can be used to allocate chunks. Implementation: 1. get all the devices free space, and align them by stripe length. 2. sort the devices by the free space. 3. check the free space of the devices, 3.1. if it is not zero, and then check the number of the devices that has more free space than this device, if the number of the devices is beyond the min stripe number, the free space can be used, and add into total free space. if the number of the devices is below the min stripe number, we can not use the free space, the check ends. 3.2. if the free space is zero, check the next devices, goto 3.1 This implementation is just likely fake chunk allocation. After appling this patch, df can show correct space information: # df -TH Filesystem Type Size Used Avail Use% Mounted on /dev/sda9 btrfs 17G 8.6G 0 100% /mnt Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-01-05 10:07:31 +00:00
/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 extent_end;
int ret;
int slot;
struct extent_buffer *l;
*length = 0;
if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
btrfs: fix wrong free space information of btrfs When we store data by raid profile in btrfs with two or more different size disks, df command shows there is some free space in the filesystem, but the user can not write any data in fact, df command shows the wrong free space information of btrfs. # mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10 # btrfs-show Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64 Total devices 2 FS bytes used 28.00KB devid 1 size 5.01GB used 2.03GB path /dev/sda9 devid 2 size 10.00GB used 2.01GB path /dev/sda10 # btrfs device scan /dev/sda9 /dev/sda10 # mount /dev/sda9 /mnt # dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999 (fill the filesystem) # sync # df -TH Filesystem Type Size Used Avail Use% Mounted on /dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt # btrfs-show Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64 Total devices 2 FS bytes used 3.99GB devid 1 size 5.01GB used 5.01GB path /dev/sda9 devid 2 size 10.00GB used 4.99GB path /dev/sda10 It is because btrfs cannot allocate chunks when one of the pairing disks has no space, the free space on the other disks can not be used for ever, and should be subtracted from the total space, but btrfs doesn't subtract this space from the total. It is strange to the user. This patch fixes it by calcing the free space that can be used to allocate chunks. Implementation: 1. get all the devices free space, and align them by stripe length. 2. sort the devices by the free space. 3. check the free space of the devices, 3.1. if it is not zero, and then check the number of the devices that has more free space than this device, if the number of the devices is beyond the min stripe number, the free space can be used, and add into total free space. if the number of the devices is below the min stripe number, we can not use the free space, the check ends. 3.2. if the free space is zero, check the next devices, goto 3.1 This implementation is just likely fake chunk allocation. After appling this patch, df can show correct space information: # df -TH Filesystem Type Size Used Avail Use% Mounted on /dev/sda9 btrfs 17G 8.6G 0 100% /mnt Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-01-05 10:07:31 +00:00
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = 2;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
goto out;
}
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.objectid < device->devid)
goto next;
if (key.objectid > device->devid)
break;
if (key.type != BTRFS_DEV_EXTENT_KEY)
btrfs: fix wrong free space information of btrfs When we store data by raid profile in btrfs with two or more different size disks, df command shows there is some free space in the filesystem, but the user can not write any data in fact, df command shows the wrong free space information of btrfs. # mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10 # btrfs-show Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64 Total devices 2 FS bytes used 28.00KB devid 1 size 5.01GB used 2.03GB path /dev/sda9 devid 2 size 10.00GB used 2.01GB path /dev/sda10 # btrfs device scan /dev/sda9 /dev/sda10 # mount /dev/sda9 /mnt # dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999 (fill the filesystem) # sync # df -TH Filesystem Type Size Used Avail Use% Mounted on /dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt # btrfs-show Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64 Total devices 2 FS bytes used 3.99GB devid 1 size 5.01GB used 5.01GB path /dev/sda9 devid 2 size 10.00GB used 4.99GB path /dev/sda10 It is because btrfs cannot allocate chunks when one of the pairing disks has no space, the free space on the other disks can not be used for ever, and should be subtracted from the total space, but btrfs doesn't subtract this space from the total. It is strange to the user. This patch fixes it by calcing the free space that can be used to allocate chunks. Implementation: 1. get all the devices free space, and align them by stripe length. 2. sort the devices by the free space. 3. check the free space of the devices, 3.1. if it is not zero, and then check the number of the devices that has more free space than this device, if the number of the devices is beyond the min stripe number, the free space can be used, and add into total free space. if the number of the devices is below the min stripe number, we can not use the free space, the check ends. 3.2. if the free space is zero, check the next devices, goto 3.1 This implementation is just likely fake chunk allocation. After appling this patch, df can show correct space information: # df -TH Filesystem Type Size Used Avail Use% Mounted on /dev/sda9 btrfs 17G 8.6G 0 100% /mnt Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-01-05 10:07:31 +00:00
goto next;
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
extent_end = key.offset + btrfs_dev_extent_length(l,
dev_extent);
if (key.offset <= start && extent_end > end) {
*length = end - start + 1;
break;
} else if (key.offset <= start && extent_end > start)
*length += extent_end - start;
else if (key.offset > start && extent_end <= end)
*length += extent_end - key.offset;
else if (key.offset > start && key.offset <= end) {
*length += end - key.offset + 1;
break;
} else if (key.offset > end)
break;
next:
path->slots[0]++;
}
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
static int contains_pending_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 *start, u64 len)
{
struct extent_map *em;
Btrfs: fix race between fs trimming and block group remove/allocation Our fs trim operation, which is completely transactionless (doesn't start or joins an existing transaction) consists of visiting all block groups and then for each one to iterate its free space entries and perform a discard operation against the space range represented by the free space entries. However before performing a discard, the corresponding free space entry is removed from the free space rbtree, and when the discard completes it is added back to the free space rbtree. If a block group remove operation happens while the discard is ongoing (or before it starts and after a free space entry is hidden), we end up not waiting for the discard to complete, remove the extent map that maps logical address to physical addresses and the corresponding chunk metadata from the the chunk and device trees. After that and before the discard completes, the current running transaction can finish and a new one start, allowing for new block groups that map to the same physical addresses to be allocated and written to. So fix this by keeping the extent map in memory until the discard completes so that the same physical addresses aren't reused before it completes. If the physical locations that are under a discard operation end up being used for a new metadata block group for example, and dirty metadata extents are written before the discard finishes (the VM might call writepages() of our btree inode's i_mapping for example, or an fsync log commit happens) we end up overwriting metadata with zeroes, which leads to errors from fsck like the following: checking extents Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block owner ref check failed [833912832 16384] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block root 5 root dir 256 error root 5 inode 260 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref root 5 inode 262 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref root 5 inode 263 errors 2001, no inode item, link count wrong (...) Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-11-27 21:14:15 +00:00
struct list_head *search_list = &trans->transaction->pending_chunks;
int ret = 0;
u64 physical_start = *start;
Btrfs: fix race between fs trimming and block group remove/allocation Our fs trim operation, which is completely transactionless (doesn't start or joins an existing transaction) consists of visiting all block groups and then for each one to iterate its free space entries and perform a discard operation against the space range represented by the free space entries. However before performing a discard, the corresponding free space entry is removed from the free space rbtree, and when the discard completes it is added back to the free space rbtree. If a block group remove operation happens while the discard is ongoing (or before it starts and after a free space entry is hidden), we end up not waiting for the discard to complete, remove the extent map that maps logical address to physical addresses and the corresponding chunk metadata from the the chunk and device trees. After that and before the discard completes, the current running transaction can finish and a new one start, allowing for new block groups that map to the same physical addresses to be allocated and written to. So fix this by keeping the extent map in memory until the discard completes so that the same physical addresses aren't reused before it completes. If the physical locations that are under a discard operation end up being used for a new metadata block group for example, and dirty metadata extents are written before the discard finishes (the VM might call writepages() of our btree inode's i_mapping for example, or an fsync log commit happens) we end up overwriting metadata with zeroes, which leads to errors from fsck like the following: checking extents Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block owner ref check failed [833912832 16384] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block root 5 root dir 256 error root 5 inode 260 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref root 5 inode 262 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref root 5 inode 263 errors 2001, no inode item, link count wrong (...) Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-11-27 21:14:15 +00:00
again:
list_for_each_entry(em, search_list, list) {
struct map_lookup *map;
int i;
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev != device)
continue;
if (map->stripes[i].physical >= physical_start + len ||
map->stripes[i].physical + em->orig_block_len <=
physical_start)
continue;
*start = map->stripes[i].physical +
em->orig_block_len;
ret = 1;
}
}
Btrfs: fix race between fs trimming and block group remove/allocation Our fs trim operation, which is completely transactionless (doesn't start or joins an existing transaction) consists of visiting all block groups and then for each one to iterate its free space entries and perform a discard operation against the space range represented by the free space entries. However before performing a discard, the corresponding free space entry is removed from the free space rbtree, and when the discard completes it is added back to the free space rbtree. If a block group remove operation happens while the discard is ongoing (or before it starts and after a free space entry is hidden), we end up not waiting for the discard to complete, remove the extent map that maps logical address to physical addresses and the corresponding chunk metadata from the the chunk and device trees. After that and before the discard completes, the current running transaction can finish and a new one start, allowing for new block groups that map to the same physical addresses to be allocated and written to. So fix this by keeping the extent map in memory until the discard completes so that the same physical addresses aren't reused before it completes. If the physical locations that are under a discard operation end up being used for a new metadata block group for example, and dirty metadata extents are written before the discard finishes (the VM might call writepages() of our btree inode's i_mapping for example, or an fsync log commit happens) we end up overwriting metadata with zeroes, which leads to errors from fsck like the following: checking extents Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block owner ref check failed [833912832 16384] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block root 5 root dir 256 error root 5 inode 260 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref root 5 inode 262 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref root 5 inode 263 errors 2001, no inode item, link count wrong (...) Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-11-27 21:14:15 +00:00
if (search_list == &trans->transaction->pending_chunks) {
search_list = &trans->root->fs_info->pinned_chunks;
goto again;
}
return ret;
}
/*
* find_free_dev_extent - find free space in the specified device
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @start: store the start of the free space.
* @len: the size of the free space. that we find, or the size of the max
* free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
* of extents
*
* @start is used to store the start of the free space if we find. But if we
* don't find suitable free space, it will be used to store the start position
* of the max free space.
*
* @len is used to store the size of the free space that we find.
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent;
struct btrfs_path *path;
u64 hole_size;
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
u64 search_start;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
/* FIXME use last free of some kind */
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
max_hole_start = search_start;
max_hole_size = 0;
again:
if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
ret = -ENOSPC;
goto out;
}
path->reada = 2;
path->search_commit_root = 1;
path->skip_locking = 1;
key.objectid = device->devid;
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid, key.type);
if (ret < 0)
goto out;
}
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.objectid < device->devid)
goto next;
if (key.objectid > device->devid)
break;
if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
if (key.offset > search_start) {
hole_size = key.offset - search_start;
/*
* Have to check before we set max_hole_start, otherwise
* we could end up sending back this offset anyway.
*/
if (contains_pending_extent(trans, device,
&search_start,
hole_size)) {
if (key.offset >= search_start) {
hole_size = key.offset - search_start;
} else {
WARN_ON_ONCE(1);
hole_size = 0;
}
}
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
/*
* If this free space is greater than which we need,
* it must be the max free space that we have found
* until now, so max_hole_start must point to the start
* of this free space and the length of this free space
* is stored in max_hole_size. Thus, we return
* max_hole_start and max_hole_size and go back to the
* caller.
*/
if (hole_size >= num_bytes) {
ret = 0;
goto out;
}
}
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
extent_end = key.offset + btrfs_dev_extent_length(l,
dev_extent);
if (extent_end > search_start)
search_start = extent_end;
next:
path->slots[0]++;
cond_resched();
}
/*
* At this point, search_start should be the end of
* allocated dev extents, and when shrinking the device,
* search_end may be smaller than search_start.
*/
if (search_end > search_start) {
hole_size = search_end - search_start;
if (contains_pending_extent(trans, device, &search_start,
hole_size)) {
btrfs_release_path(path);
goto again;
}
if (hole_size > max_hole_size) {
max_hole_start = search_start;
max_hole_size = hole_size;
}
}
/* See above. */
if (max_hole_size < num_bytes)
ret = -ENOSPC;
else
ret = 0;
out:
btrfs_free_path(path);
*start = max_hole_start;
if (len)
*len = max_hole_size;
return ret;
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
{
int ret;
struct btrfs_path *path;
struct btrfs_root *root = device->dev_root;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *leaf = NULL;
struct btrfs_dev_extent *extent = NULL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
if (ret)
goto out;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
key = found_key;
btrfs_release_path(path);
goto again;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
btrfs_error(root->fs_info, ret, "Slot search failed");
goto out;
}
*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
ret = btrfs_del_item(trans, root, path);
if (ret) {
btrfs_error(root->fs_info, ret,
"Failed to remove dev extent item");
btrfs: Fix out-of-space bug Btrfs will report NO_SPACE when we create and remove files for several times, and we can't write to filesystem until mount it again. Steps to reproduce: 1: Create a single-dev btrfs fs with default option 2: Write a file into it to take up most fs space 3: Delete above file 4: Wait about 100s to let chunk removed 5: goto 2 Script is like following: #!/bin/bash # Recommend 1.2G space, too large disk will make test slow DEV="/dev/sda16" MNT="/mnt/tmp" dev_size="$(lsblk -bn -o SIZE "$DEV")" || exit 2 file_size_m=$((dev_size * 75 / 100 / 1024 / 1024)) echo "Loop write ${file_size_m}M file on $((dev_size / 1024 / 1024))M dev" for ((i = 0; i < 10; i++)); do umount "$MNT" 2>/dev/null; done echo "mkfs $DEV" mkfs.btrfs -f "$DEV" >/dev/null || exit 2 echo "mount $DEV $MNT" mount "$DEV" "$MNT" || exit 2 for ((loop_i = 0; loop_i < 20; loop_i++)); do echo echo "loop $loop_i" echo "dd file..." cmd=(dd if=/dev/zero of="$MNT"/file0 bs=1M count="$file_size_m") "${cmd[@]}" 2>/dev/null || { # NO_SPACE error triggered echo "dd failed: ${cmd[*]}" exit 1 } echo "rm file..." rm -f "$MNT"/file0 || exit 2 for ((i = 0; i < 10; i++)); do df "$MNT" | tail -1 sleep 10 done done Reason: It is triggered by commit: 47ab2a6c689913db23ccae38349714edf8365e0a which is used to remove empty block groups automatically, but the reason is not in that patch. Code before works well because btrfs don't need to create and delete chunks so many times with high complexity. Above bug is caused by many reason, any of them can trigger it. Reason1: When we remove some continuous chunks but leave other chunks after, these disk space should be used by chunk-recreating, but in current code, only first create will successed. Fixed by Forrest Liu <forrestl@synology.com> in: Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole Reason2: contains_pending_extent() return wrong value in calculation. Fixed by Forrest Liu <forrestl@synology.com> in: Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole Reason3: btrfs_check_data_free_space() try to commit transaction and retry allocating chunk when the first allocating failed, but space_info->full is set in first allocating, and prevent second allocating in retry. Fixed in this patch by clear space_info->full in commit transaction. Tested for severial times by above script. Changelog v3->v4: use light weight int instead of atomic_t to record have_remove_bgs in transaction, suggested by: Josef Bacik <jbacik@fb.com> Changelog v2->v3: v2 fixed the bug by adding more commit-transaction, but we only need to reclaim space when we are really have no space for new chunk, noticed by: Filipe David Manana <fdmanana@gmail.com> Actually, our code already have this type of commit-and-retry, we only need to make it working with removed-bgs. v3 fixed the bug with above way. Changelog v1->v2: v1 will introduce a new bug when delete and create chunk in same disk space in same transaction, noticed by: Filipe David Manana <fdmanana@gmail.com> V2 fix this bug by commit transaction after remove block grops. Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com> Suggested-by: Filipe David Manana <fdmanana@gmail.com> Suggested-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-02-12 06:18:17 +00:00
} else {
trans->transaction->have_free_bgs = 1;
}
out:
btrfs_free_path(path);
return ret;
}
static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset, u64 start, u64 num_bytes)
{
int ret;
struct btrfs_path *path;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
WARN_ON(!device->in_fs_metadata);
WARN_ON(device->is_tgtdev_for_dev_replace);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = device->devid;
key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
if (ret)
goto out;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
em_tree = &fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
n = rb_last(&em_tree->map);
if (n) {
em = rb_entry(n, struct extent_map, rb_node);
ret = em->start + em->len;
}
read_unlock(&em_tree->lock);
return ret;
}
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
u64 *devid_ret)
{
int ret;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(fs_info->chunk_root, path,
BTRFS_DEV_ITEMS_OBJECTID,
BTRFS_DEV_ITEM_KEY);
if (ret) {
*devid_ret = 1;
} else {
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
*devid_ret = found_key.offset + 1;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
/*
* the device information is stored in the chunk root
* the btrfs_device struct should be fully filled in
*/
static int btrfs_add_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
root = root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*dev_item));
if (ret)
goto out;
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
btrfs_set_device_id(leaf, dev_item, device->devid);
btrfs_set_device_generation(leaf, dev_item, 0);
btrfs_set_device_type(leaf, dev_item, device->type);
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
btrfs_set_device_total_bytes(leaf, dev_item,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
btrfs_set_device_group(leaf, dev_item, 0);
btrfs_set_device_seek_speed(leaf, dev_item, 0);
btrfs_set_device_bandwidth(leaf, dev_item, 0);
btrfs_set_device_start_offset(leaf, dev_item, 0);
ptr = btrfs_device_uuid(dev_item);
write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
ptr = btrfs_device_fsid(dev_item);
write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
btrfs_mark_buffer_dirty(leaf);
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
/*
* Function to update ctime/mtime for a given device path.
* Mainly used for ctime/mtime based probe like libblkid.
*/
static void update_dev_time(char *path_name)
{
struct file *filp;
filp = filp_open(path_name, O_RDWR, 0);
if (IS_ERR(filp))
return;
file_update_time(filp);
filp_close(filp, NULL);
return;
}
static int btrfs_rm_dev_item(struct btrfs_root *root,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_trans_handle *trans;
root = root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret)
goto out;
out:
btrfs_free_path(path);
btrfs_commit_transaction(trans, root);
return ret;
}
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
struct btrfs_device *device;
struct btrfs_device *next_device;
struct block_device *bdev;
struct buffer_head *bh = NULL;
struct btrfs_super_block *disk_super;
struct btrfs_fs_devices *cur_devices;
u64 all_avail;
u64 devid;
u64 num_devices;
u8 *dev_uuid;
unsigned seq;
int ret = 0;
bool clear_super = false;
mutex_lock(&uuid_mutex);
do {
seq = read_seqbegin(&root->fs_info->profiles_lock);
all_avail = root->fs_info->avail_data_alloc_bits |
root->fs_info->avail_system_alloc_bits |
root->fs_info->avail_metadata_alloc_bits;
} while (read_seqretry(&root->fs_info->profiles_lock, seq));
num_devices = root->fs_info->fs_devices->num_devices;
btrfs_dev_replace_lock(&root->fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
num_devices--;
}
btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
root->fs_info->fs_devices->rw_devices <= 2) {
ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
goto out;
}
if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
root->fs_info->fs_devices->rw_devices <= 3) {
ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
goto out;
}
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
device = NULL;
devices = &root->fs_info->fs_devices->devices;
/*
* It is safe to read the devices since the volume_mutex
* is held.
*/
list_for_each_entry(tmp, devices, dev_list) {
if (tmp->in_fs_metadata &&
!tmp->is_tgtdev_for_dev_replace &&
!tmp->bdev) {
device = tmp;
break;
}
}
bdev = NULL;
bh = NULL;
disk_super = NULL;
if (!device) {
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
goto out;
}
} else {
ret = btrfs_get_bdev_and_sb(device_path,
FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder, 0,
&bdev, &bh);
if (ret)
goto out;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
device = btrfs_find_device(root->fs_info, devid, dev_uuid,
disk_super->fsid);
if (!device) {
ret = -ENOENT;
goto error_brelse;
}
}
if (device->is_tgtdev_for_dev_replace) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
goto error_brelse;
}
if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
goto error_brelse;
}
if (device->writeable) {
lock_chunks(root);
list_del_init(&device->dev_alloc_list);
device->fs_devices->rw_devices--;
unlock_chunks(root);
clear_super = true;
}
mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
/*
* TODO: the superblock still includes this device in its num_devices
* counter although write_all_supers() is not locked out. This
* could give a filesystem state which requires a degraded mount.
*/
ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
if (ret)
goto error_undo;
device->in_fs_metadata = 0;
btrfs_scrub_cancel_dev(root->fs_info, device);
/*
* the device list mutex makes sure that we don't change
* the device list while someone else is writing out all
Btrfs: fix race between removing a dev and writing sbs This change fixes an issue when removing a device and writing all super blocks run simultaneously. Here's the steps necessary for the issue to happen: 1) disk-io.c:write_all_supers() gets a number of N devices from the super_copy, so it will not panic if it fails to write super blocks for N - 1 devices; 2) Then it tries to acquire the device_list_mutex, but blocks because volumes.c:btrfs_rm_device() got it first; 3) btrfs_rm_device() removes the device from the list, then unlocks the mutex and after the unlock it updates the number of devices in super_copy to N - 1. 4) write_all_supers() finally acquires the mutex, iterates over all the devices in the list and gets N - 1 errors, that is, it failed to write super blocks to all the devices; 5) Because write_all_supers() thinks there are a total of N devices, it considers N - 1 errors to be ok, and therefore won't panic. So this change just makes sure that write_all_supers() reads the number of devices from super_copy after it acquires the device_list_mutex. Conversely, it changes btrfs_rm_device() to update the number of devices in super_copy before it releases the device list mutex. The code path to add a new device (volumes.c:btrfs_init_new_device), already has the right behaviour: it updates the number of devices in super_copy while holding the device_list_mutex. The only code path that doesn't lock the device list mutex before updating the number of devices in the super copy is disk-io.c:next_root_backup(), called by open_ctree() during mount time where concurrency issues can't happen. Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-09 14:41:36 +00:00
* the device supers. Whoever is writing all supers, should
* lock the device list mutex before getting the number of
* devices in the super block (super_copy). Conversely,
* whoever updates the number of devices in the super block
* (super_copy) should hold the device list mutex.
*/
cur_devices = device->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_del_rcu(&device->dev_list);
device->fs_devices->num_devices--;
device->fs_devices->total_devices--;
if (device->missing)
device->fs_devices->missing_devices--;
next_device = list_entry(root->fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
if (device->bdev == root->fs_info->sb->s_bdev)
root->fs_info->sb->s_bdev = next_device->bdev;
if (device->bdev == root->fs_info->fs_devices->latest_bdev)
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
btrfs_kobj_rm_device(root->fs_info, device);
}
call_rcu(&device->rcu, free_device);
num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
Btrfs: fix race between removing a dev and writing sbs This change fixes an issue when removing a device and writing all super blocks run simultaneously. Here's the steps necessary for the issue to happen: 1) disk-io.c:write_all_supers() gets a number of N devices from the super_copy, so it will not panic if it fails to write super blocks for N - 1 devices; 2) Then it tries to acquire the device_list_mutex, but blocks because volumes.c:btrfs_rm_device() got it first; 3) btrfs_rm_device() removes the device from the list, then unlocks the mutex and after the unlock it updates the number of devices in super_copy to N - 1. 4) write_all_supers() finally acquires the mutex, iterates over all the devices in the list and gets N - 1 errors, that is, it failed to write super blocks to all the devices; 5) Because write_all_supers() thinks there are a total of N devices, it considers N - 1 errors to be ok, and therefore won't panic. So this change just makes sure that write_all_supers() reads the number of devices from super_copy after it acquires the device_list_mutex. Conversely, it changes btrfs_rm_device() to update the number of devices in super_copy before it releases the device list mutex. The code path to add a new device (volumes.c:btrfs_init_new_device), already has the right behaviour: it updates the number of devices in super_copy while holding the device_list_mutex. The only code path that doesn't lock the device list mutex before updating the number of devices in the super copy is disk-io.c:next_root_backup(), called by open_ctree() during mount time where concurrency issues can't happen. Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-09 14:41:36 +00:00
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
fs_devices = root->fs_info->fs_devices;
while (fs_devices) {
if (fs_devices->seed == cur_devices) {
fs_devices->seed = cur_devices->seed;
break;
}
fs_devices = fs_devices->seed;
}
cur_devices->seed = NULL;
__btrfs_close_devices(cur_devices);
free_fs_devices(cur_devices);
}
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
/*
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
*/
if (clear_super && disk_super) {
u64 bytenr;
int i;
/* make sure this device isn't detected as part of
* the FS anymore
*/
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
/* clear the mirror copies of super block on the disk
* being removed, 0th copy is been taken care above and
* the below would take of the rest
*/
for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
if (bytenr + BTRFS_SUPER_INFO_SIZE >=
i_size_read(bdev->bd_inode))
break;
brelse(bh);
bh = __bread(bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
if (!bh)
continue;
disk_super = (struct btrfs_super_block *)bh->b_data;
if (btrfs_super_bytenr(disk_super) != bytenr ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
continue;
}
memset(&disk_super->magic, 0,
sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
}
}
ret = 0;
if (bdev) {
/* Notify udev that device has changed */
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
update_dev_time(device_path);
}
error_brelse:
brelse(bh);
if (bdev)
block: make blkdev_get/put() handle exclusive access Over time, block layer has accumulated a set of APIs dealing with bdev open, close, claim and release. * blkdev_get/put() are the primary open and close functions. * bd_claim/release() deal with exclusive open. * open/close_bdev_exclusive() are combination of open and claim and the other way around, respectively. * bd_link/unlink_disk_holder() to create and remove holder/slave symlinks. * open_by_devnum() wraps bdget() + blkdev_get(). The interface is a bit confusing and the decoupling of open and claim makes it impossible to properly guarantee exclusive access as in-kernel open + claim sequence can disturb the existing exclusive open even before the block layer knows the current open if for another exclusive access. Reorganize the interface such that, * blkdev_get() is extended to include exclusive access management. @holder argument is added and, if is @FMODE_EXCL specified, it will gain exclusive access atomically w.r.t. other exclusive accesses. * blkdev_put() is similarly extended. It now takes @mode argument and if @FMODE_EXCL is set, it releases an exclusive access. Also, when the last exclusive claim is released, the holder/slave symlinks are removed automatically. * bd_claim/release() and close_bdev_exclusive() are no longer necessary and either made static or removed. * bd_link_disk_holder() remains the same but bd_unlink_disk_holder() is no longer necessary and removed. * open_bdev_exclusive() becomes a simple wrapper around lookup_bdev() and blkdev_get(). It also has an unexpected extra bdev_read_only() test which probably should be moved into blkdev_get(). * open_by_devnum() is modified to take @holder argument and pass it to blkdev_get(). Most of bdev open/close operations are unified into blkdev_get/put() and most exclusive accesses are tested atomically at the open time (as it should). This cleans up code and removes some, both valid and invalid, but unnecessary all the same, corner cases. open_bdev_exclusive() and open_by_devnum() can use further cleanup - rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop special features. Well, let's leave them for another day. Most conversions are straight-forward. drbd conversion is a bit more involved as there was some reordering, but the logic should stay the same. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Neil Brown <neilb@suse.de> Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Acked-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Philipp Reisner <philipp.reisner@linbit.com> Cc: Peter Osterlund <petero2@telia.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Jan Kara <jack@suse.cz> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <joel.becker@oracle.com> Cc: Alex Elder <aelder@sgi.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: dm-devel@redhat.com Cc: drbd-dev@lists.linbit.com Cc: Leo Chen <leochen@broadcom.com> Cc: Scott Branden <sbranden@broadcom.com> Cc: Chris Mason <chris.mason@oracle.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com> Cc: Joern Engel <joern@logfs.org> Cc: reiserfs-devel@vger.kernel.org Cc: Alexander Viro <viro@zeniv.linux.org.uk>
2010-11-13 10:55:17 +00:00
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
mutex_unlock(&uuid_mutex);
return ret;
error_undo:
if (device->writeable) {
lock_chunks(root);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
device->fs_devices->rw_devices++;
unlock_chunks(root);
}
goto error_brelse;
}
btrfs: Fix a lockdep warning when running xfstest. The following lockdep warning is triggered during xfstests: [ 1702.980872] ========================================================= [ 1702.981181] [ INFO: possible irq lock inversion dependency detected ] [ 1702.981482] 3.18.0-rc1 #27 Not tainted [ 1702.981781] --------------------------------------------------------- [ 1702.982095] kswapd0/77 just changed the state of lock: [ 1702.982415] (&delayed_node->mutex){+.+.-.}, at: [<ffffffffa03b0b51>] __btrfs_release_delayed_node+0x41/0x1f0 [btrfs] [ 1702.982794] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1702.983160] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1702.984675] other info that might help us debug this: [ 1702.985524] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1702.986799] Possible interrupt unsafe locking scenario: [ 1702.987681] CPU0 CPU1 [ 1702.988137] ---- ---- [ 1702.988598] lock(&fs_info->dev_replace.lock); [ 1702.989069] local_irq_disable(); [ 1702.989534] lock(&delayed_node->mutex); [ 1702.990038] lock(&found->groups_sem); [ 1702.990494] <Interrupt> [ 1702.990938] lock(&delayed_node->mutex); [ 1702.991407] *** DEADLOCK *** It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for it self to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involing split the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-10-30 08:52:31 +00:00
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev)
{
struct btrfs_fs_devices *fs_devices;
WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
/*
* in case of fs with no seed, srcdev->fs_devices will point
* to fs_devices of fs_info. However when the dev being replaced is
* a seed dev it will point to the seed's local fs_devices. In short
* srcdev will have its correct fs_devices in both the cases.
*/
fs_devices = srcdev->fs_devices;
list_del_rcu(&srcdev->dev_list);
list_del_rcu(&srcdev->dev_alloc_list);
fs_devices->num_devices--;
if (srcdev->missing)
fs_devices->missing_devices--;
if (srcdev->writeable) {
fs_devices->rw_devices--;
/* zero out the old super if it is writable */
btrfs_scratch_superblock(srcdev);
}
if (srcdev->bdev)
fs_devices->open_devices--;
btrfs: Fix a lockdep warning when running xfstest. The following lockdep warning is triggered during xfstests: [ 1702.980872] ========================================================= [ 1702.981181] [ INFO: possible irq lock inversion dependency detected ] [ 1702.981482] 3.18.0-rc1 #27 Not tainted [ 1702.981781] --------------------------------------------------------- [ 1702.982095] kswapd0/77 just changed the state of lock: [ 1702.982415] (&delayed_node->mutex){+.+.-.}, at: [<ffffffffa03b0b51>] __btrfs_release_delayed_node+0x41/0x1f0 [btrfs] [ 1702.982794] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1702.983160] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1702.984675] other info that might help us debug this: [ 1702.985524] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1702.986799] Possible interrupt unsafe locking scenario: [ 1702.987681] CPU0 CPU1 [ 1702.988137] ---- ---- [ 1702.988598] lock(&fs_info->dev_replace.lock); [ 1702.989069] local_irq_disable(); [ 1702.989534] lock(&delayed_node->mutex); [ 1702.990038] lock(&found->groups_sem); [ 1702.990494] <Interrupt> [ 1702.990938] lock(&delayed_node->mutex); [ 1702.991407] *** DEADLOCK *** It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for it self to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involing split the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-10-30 08:52:31 +00:00
}
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev)
{
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
call_rcu(&srcdev->rcu, free_device);
/*
* unless fs_devices is seed fs, num_devices shouldn't go
* zero
*/
BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
struct btrfs_fs_devices *tmp_fs_devices;
tmp_fs_devices = fs_info->fs_devices;
while (tmp_fs_devices) {
if (tmp_fs_devices->seed == fs_devices) {
tmp_fs_devices->seed = fs_devices->seed;
break;
}
tmp_fs_devices = tmp_fs_devices->seed;
}
fs_devices->seed = NULL;
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
}
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
struct btrfs_device *next_device;
mutex_lock(&uuid_mutex);
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
if (tgtdev->bdev) {
btrfs_scratch_superblock(tgtdev);
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
next_device = list_entry(fs_info->fs_devices->devices.next,
struct btrfs_device, dev_list);
if (tgtdev->bdev == fs_info->sb->s_bdev)
fs_info->sb->s_bdev = next_device->bdev;
if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
fs_info->fs_devices->latest_bdev = next_device->bdev;
list_del_rcu(&tgtdev->dev_list);
call_rcu(&tgtdev->rcu, free_device);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
}
static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
struct btrfs_device **device)
{
int ret = 0;
struct btrfs_super_block *disk_super;
u64 devid;
u8 *dev_uuid;
struct block_device *bdev;
struct buffer_head *bh;
*device = NULL;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
root->fs_info->bdev_holder, 0, &bdev, &bh);
if (ret)
return ret;
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
*device = btrfs_find_device(root->fs_info, devid, dev_uuid,
disk_super->fsid);
brelse(bh);
if (!*device)
ret = -ENOENT;
blkdev_put(bdev, FMODE_READ);
return ret;
}
int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
char *device_path,
struct btrfs_device **device)
{
*device = NULL;
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
devices = &root->fs_info->fs_devices->devices;
/*
* It is safe to read the devices since the volume_mutex
* is held by the caller.
*/
list_for_each_entry(tmp, devices, dev_list) {
if (tmp->in_fs_metadata && !tmp->bdev) {
*device = tmp;
break;
}
}
if (!*device) {
btrfs_err(root->fs_info, "no missing device found");
return -ENOENT;
}
return 0;
} else {
return btrfs_find_device_by_path(root, device_path, device);
}
}
/*
* does all the dirty work required for changing file system's UUID.
*/
static int btrfs_prepare_sprout(struct btrfs_root *root)
{
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
struct btrfs_fs_devices *seed_devices;
struct btrfs_super_block *disk_super = root->fs_info->super_copy;
struct btrfs_device *device;
u64 super_flags;
BUG_ON(!mutex_is_locked(&uuid_mutex));
if (!fs_devices->seeding)
return -EINVAL;
seed_devices = __alloc_fs_devices();
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
return PTR_ERR(old_devices);
}
list_add(&old_devices->list, &fs_uuids);
memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
seed_devices->opened = 1;
INIT_LIST_HEAD(&seed_devices->devices);
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
synchronize_rcu);
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
lock_chunks(root);
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
unlock_chunks(root);
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = 0;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
Btrfs: fix race conditions in BTRFS_IOC_FS_INFO ioctl The handler for the ioctl BTRFS_IOC_FS_INFO was reading the number of devices before acquiring the device list mutex. This could lead to inconsistent results because the update of the device list and the number of devices counter (amongst other counters related to the device list) are updated in volumes.c while holding the device list mutex - except for 2 places, one was volumes.c:btrfs_prepare_sprout() and the other was volumes.c:device_list_add(). For example, if we have 2 devices, with IDs 1 and 2 and then add a new device, with ID 3, and while adding the device is in progress an BTRFS_IOC_FS_INFO ioctl arrives, it could return a number of devices of 2 and a max dev id of 3. This would be incorrect. Also, this ioctl handler was reading the fsid while it can be updated concurrently. This can happen when while a new device is being added and the current filesystem is in seeding mode. Example: $ mkfs.btrfs -f /dev/sdb1 $ mkfs.btrfs -f /dev/sdb2 $ btrfstune -S 1 /dev/sdb1 $ mount /dev/sdb1 /mnt/test $ btrfs device add /dev/sdb2 /mnt/test If during the last step a BTRFS_IOC_FS_INFO ioctl was requested, it could read an fsid that was never valid (some bits part of the old fsid and others part of the new fsid). Also, it could read a number of devices that doesn't match the number of devices in the list and the max device id, as explained before. Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-12 19:56:58 +00:00
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
super_flags = btrfs_super_flags(disk_super) &
~BTRFS_SUPER_FLAG_SEEDING;
btrfs_set_super_flags(disk_super, super_flags);
return 0;
}
/*
* strore the expected generation for seed devices in device items.
*/
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dev_item *dev_item;
struct btrfs_device *device;
struct btrfs_key key;
u8 fs_uuid[BTRFS_UUID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
u64 devid;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
root = root->fs_info->chunk_root;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto error;
leaf = path->nodes[0];
next_slot:
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret > 0)
break;
if (ret < 0)
goto error;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(path);
continue;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
key.type != BTRFS_DEV_ITEM_KEY)
break;
dev_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_item);
devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
device = btrfs_find_device(root->fs_info, devid, dev_uuid,
fs_uuid);
BUG_ON(!device); /* Logic error */
if (device->fs_devices->seeding) {
btrfs_set_device_generation(leaf, dev_item,
device->generation);
btrfs_mark_buffer_dirty(leaf);
}
path->slots[0]++;
goto next_slot;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
{
struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
struct super_block *sb = root->fs_info->sb;
struct rcu_string *name;
u64 tmp;
int seeding_dev = 0;
int ret = 0;
if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
return -EROFS;
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
if (root->fs_info->fs_devices->seeding) {
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &root->fs_info->fs_devices->devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
goto error;
}
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
device = btrfs_alloc_device(root->fs_info, NULL, NULL);
if (IS_ERR(device)) {
/* we can safely leave the fs_devices entry around */
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_NOFS);
if (!name) {
kfree(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
rcu_string_free(device->name);
kfree(device);
ret = PTR_ERR(trans);
goto error;
}
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
device->writeable = 1;
device->generation = trans->transid;
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
device->total_bytes = i_size_read(bdev->bd_inode);
device->disk_total_bytes = device->total_bytes;
device->commit_total_bytes = device->total_bytes;
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
device->is_tgtdev_for_dev_replace = 0;
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, 4096);
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
ret = btrfs_prepare_sprout(root);
BUG_ON(ret); /* -ENOMEM */
}
device->fs_devices = root->fs_info->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
lock_chunks(root);
list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
root->fs_info->fs_devices->num_devices++;
root->fs_info->fs_devices->open_devices++;
root->fs_info->fs_devices->rw_devices++;
root->fs_info->fs_devices->total_devices++;
root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += device->total_bytes;
spin_unlock(&root->fs_info->free_chunk_lock);
if (!blk_queue_nonrot(bdev_get_queue(bdev)))
root->fs_info->fs_devices->rotating = 1;
tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
btrfs_set_super_total_bytes(root->fs_info->super_copy,
tmp + device->total_bytes);
tmp = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
tmp + 1);
/* add sysfs device entry */
btrfs_kobj_add_device(root->fs_info, device);
/*
* we've got more storage, clear any full flags on the space
* infos
*/
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
lock_chunks(root);
ret = init_first_rw_device(trans, root, device);
unlock_chunks(root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
}
ret = btrfs_add_device(trans, root, device);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
if (seeding_dev) {
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
ret = btrfs_finish_sprout(trans, root);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
/* Sprouting would change fsid of the mounted root,
* so rename the fsid on the sysfs
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
fsid_buf))
goto error_trans;
}
root->fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
ret = btrfs_commit_transaction(trans, root);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
if (ret) /* transaction commit */
return ret;
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
btrfs_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
trans = btrfs_attach_transaction(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) == -ENOENT)
return 0;
return PTR_ERR(trans);
}
ret = btrfs_commit_transaction(trans, root);
}
/* Update ctime/mtime for libblkid */
update_dev_time(device_path);
return ret;
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
btrfs_kobj_rm_device(root->fs_info, device);
kfree(device);
error:
block: make blkdev_get/put() handle exclusive access Over time, block layer has accumulated a set of APIs dealing with bdev open, close, claim and release. * blkdev_get/put() are the primary open and close functions. * bd_claim/release() deal with exclusive open. * open/close_bdev_exclusive() are combination of open and claim and the other way around, respectively. * bd_link/unlink_disk_holder() to create and remove holder/slave symlinks. * open_by_devnum() wraps bdget() + blkdev_get(). The interface is a bit confusing and the decoupling of open and claim makes it impossible to properly guarantee exclusive access as in-kernel open + claim sequence can disturb the existing exclusive open even before the block layer knows the current open if for another exclusive access. Reorganize the interface such that, * blkdev_get() is extended to include exclusive access management. @holder argument is added and, if is @FMODE_EXCL specified, it will gain exclusive access atomically w.r.t. other exclusive accesses. * blkdev_put() is similarly extended. It now takes @mode argument and if @FMODE_EXCL is set, it releases an exclusive access. Also, when the last exclusive claim is released, the holder/slave symlinks are removed automatically. * bd_claim/release() and close_bdev_exclusive() are no longer necessary and either made static or removed. * bd_link_disk_holder() remains the same but bd_unlink_disk_holder() is no longer necessary and removed. * open_bdev_exclusive() becomes a simple wrapper around lookup_bdev() and blkdev_get(). It also has an unexpected extra bdev_read_only() test which probably should be moved into blkdev_get(). * open_by_devnum() is modified to take @holder argument and pass it to blkdev_get(). Most of bdev open/close operations are unified into blkdev_get/put() and most exclusive accesses are tested atomically at the open time (as it should). This cleans up code and removes some, both valid and invalid, but unnecessary all the same, corner cases. open_bdev_exclusive() and open_by_devnum() can use further cleanup - rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop special features. Well, let's leave them for another day. Most conversions are straight-forward. drbd conversion is a bit more involved as there was some reordering, but the logic should stay the same. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Neil Brown <neilb@suse.de> Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Acked-by: Mike Snitzer <snitzer@redhat.com> Acked-by: Philipp Reisner <philipp.reisner@linbit.com> Cc: Peter Osterlund <petero2@telia.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Jan Kara <jack@suse.cz> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <joel.becker@oracle.com> Cc: Alex Elder <aelder@sgi.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: dm-devel@redhat.com Cc: drbd-dev@lists.linbit.com Cc: Leo Chen <leochen@broadcom.com> Cc: Scott Branden <sbranden@broadcom.com> Cc: Chris Mason <chris.mason@oracle.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com> Cc: Joern Engel <joern@logfs.org> Cc: reiserfs-devel@vger.kernel.org Cc: Alexander Viro <viro@zeniv.linux.org.uk>
2010-11-13 10:55:17 +00:00
blkdev_put(bdev, FMODE_EXCL);
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
return ret;
}
int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct request_queue *q;
struct btrfs_device *device;
struct block_device *bdev;
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
if (fs_info->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
if (IS_ERR(bdev)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev);
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info, "target device is in the filesystem!");
ret = -EEXIST;
goto error;
}
}
if (i_size_read(bdev->bd_inode) <
btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info, "target device is smaller than source device!");
ret = -EINVAL;
goto error;
}
device = btrfs_alloc_device(NULL, &devid, NULL);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_NOFS);
if (!name) {
kfree(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
q = bdev_get_queue(bdev);
if (blk_queue_discard(q))
device->can_discard = 1;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
device->writeable = 1;
device->generation = 0;
device->io_width = root->sectorsize;
device->io_align = root->sectorsize;
device->sector_size = root->sectorsize;
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
ASSERT(list_empty(&srcdev->resized_list));
device->commit_total_bytes = srcdev->commit_total_bytes;
device->commit_bytes_used = device->bytes_used;
device->dev_root = fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
device->is_tgtdev_for_dev_replace = 1;
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, 4096);
device->fs_devices = fs_info->fs_devices;
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
*device_out = device;
return ret;
error:
blkdev_put(bdev, FMODE_EXCL);
return ret;
}
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev)
{
WARN_ON(fs_info->fs_devices->rw_devices == 0);
tgtdev->io_width = fs_info->dev_root->sectorsize;
tgtdev->io_align = fs_info->dev_root->sectorsize;
tgtdev->sector_size = fs_info->dev_root->sectorsize;
tgtdev->dev_root = fs_info->dev_root;
tgtdev->in_fs_metadata = 1;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
int ret;
struct btrfs_path *path;
struct btrfs_root *root;
struct btrfs_dev_item *dev_item;
struct extent_buffer *leaf;
struct btrfs_key key;
root = device->dev_root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
leaf = path->nodes[0];
dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
btrfs_set_device_id(leaf, dev_item, device->devid);
btrfs_set_device_type(leaf, dev_item, device->type);
btrfs_set_device_io_align(leaf, dev_item, device->io_align);
btrfs_set_device_io_width(leaf, dev_item, device->io_width);
btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
btrfs_set_device_total_bytes(leaf, dev_item,
btrfs_device_get_disk_total_bytes(device));
btrfs_set_device_bytes_used(leaf, dev_item,
btrfs_device_get_bytes_used(device));
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
return ret;
}
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size)
{
struct btrfs_super_block *super_copy =
device->dev_root->fs_info->super_copy;
struct btrfs_fs_devices *fs_devices;
u64 old_total;
u64 diff;
if (!device->writeable)
return -EACCES;
lock_chunks(device->dev_root);
old_total = btrfs_super_total_bytes(super_copy);
diff = new_size - device->total_bytes;
if (new_size <= device->total_bytes ||
device->is_tgtdev_for_dev_replace) {
unlock_chunks(device->dev_root);
return -EINVAL;
}
fs_devices = device->dev_root->fs_info->fs_devices;
btrfs_set_super_total_bytes(super_copy, old_total + diff);
device->fs_devices->total_rw_bytes += diff;
btrfs_device_set_total_bytes(device, new_size);
btrfs_device_set_disk_total_bytes(device, new_size);
btrfs_clear_space_info_full(device->dev_root->fs_info);
if (list_empty(&device->resized_list))
list_add_tail(&device->resized_list,
&fs_devices->resized_devices);
unlock_chunks(device->dev_root);
return btrfs_update_device(trans, device);
}
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 chunk_objectid,
u64 chunk_offset)
{
int ret;
struct btrfs_path *path;
struct btrfs_key key;
root = root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = chunk_objectid;
key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
btrfs_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
btrfs_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
return ret;
}
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
chunk_offset)
{
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *ptr;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
u32 cur;
struct btrfs_key key;
lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
ptr = super_copy->sys_chunk_array;
cur = 0;
while (cur < array_size) {
disk_key = (struct btrfs_disk_key *)ptr;
btrfs_disk_key_to_cpu(&key, disk_key);
len = sizeof(*disk_key);
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
chunk = (struct btrfs_chunk *)(ptr + len);
num_stripes = btrfs_stack_chunk_num_stripes(chunk);
len += btrfs_chunk_item_size(num_stripes);
} else {
ret = -EIO;
break;
}
if (key.objectid == chunk_objectid &&
key.offset == chunk_offset) {
memmove(ptr, ptr + len, array_size - (cur + len));
array_size -= len;
btrfs_set_super_sys_array_size(super_copy, array_size);
} else {
ptr += len;
cur += len;
}
}
unlock_chunks(root);
return ret;
}
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 chunk_offset)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_root *extent_root = root->fs_info->extent_root;
struct map_lookup *map;
u64 dev_extent_len = 0;
u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
int i, ret = 0;
/* Just in case */
root = root->fs_info->chunk_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
if (!em || em->start > chunk_offset ||
em->start + em->len < chunk_offset) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doens't
* do anything we still error out.
*/
ASSERT(0);
if (em)
free_extent_map(em);
return -EINVAL;
}
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
if (device->bytes_used > 0) {
lock_chunks(root);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += dev_extent_len;
spin_unlock(&root->fs_info->free_chunk_lock);
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
}
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
}
}
ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
Btrfs: add initial tracepoint support for btrfs Tracepoints can provide insight into why btrfs hits bugs and be greatly helpful for debugging, e.g dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0 dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0 btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0) btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0) btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8 flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0) flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0) flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0) Here is what I have added: 1) ordere_extent: btrfs_ordered_extent_add btrfs_ordered_extent_remove btrfs_ordered_extent_start btrfs_ordered_extent_put These provide critical information to understand how ordered_extents are updated. 2) extent_map: btrfs_get_extent extent_map is used in both read and write cases, and it is useful for tracking how btrfs specific IO is running. 3) writepage: __extent_writepage btrfs_writepage_end_io_hook Pages are cirtical resourses and produce a lot of corner cases during writeback, so it is valuable to know how page is written to disk. 4) inode: btrfs_inode_new btrfs_inode_request btrfs_inode_evict These can show where and when a inode is created, when a inode is evicted. 5) sync: btrfs_sync_file btrfs_sync_fs These show sync arguments. 6) transaction: btrfs_transaction_commit In transaction based filesystem, it will be useful to know the generation and who does commit. 7) back reference and cow: btrfs_delayed_tree_ref btrfs_delayed_data_ref btrfs_delayed_ref_head btrfs_cow_block Btrfs natively supports back references, these tracepoints are helpful on understanding btrfs's COW mechanism. 8) chunk: btrfs_chunk_alloc btrfs_chunk_free Chunk is a link between physical offset and logical offset, and stands for space infomation in btrfs, and these are helpful on tracing space things. 9) reserved_extent: btrfs_reserved_extent_alloc btrfs_reserved_extent_free These can show how btrfs uses its space. Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
}
Btrfs: fix race between fs trimming and block group remove/allocation Our fs trim operation, which is completely transactionless (doesn't start or joins an existing transaction) consists of visiting all block groups and then for each one to iterate its free space entries and perform a discard operation against the space range represented by the free space entries. However before performing a discard, the corresponding free space entry is removed from the free space rbtree, and when the discard completes it is added back to the free space rbtree. If a block group remove operation happens while the discard is ongoing (or before it starts and after a free space entry is hidden), we end up not waiting for the discard to complete, remove the extent map that maps logical address to physical addresses and the corresponding chunk metadata from the the chunk and device trees. After that and before the discard completes, the current running transaction can finish and a new one start, allowing for new block groups that map to the same physical addresses to be allocated and written to. So fix this by keeping the extent map in memory until the discard completes so that the same physical addresses aren't reused before it completes. If the physical locations that are under a discard operation end up being used for a new metadata block group for example, and dirty metadata extents are written before the discard finishes (the VM might call writepages() of our btree inode's i_mapping for example, or an fsync log commit happens) we end up overwriting metadata with zeroes, which leads to errors from fsck like the following: checking extents Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block owner ref check failed [833912832 16384] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 Check tree block failed, want=833912832, have=0 read block failed check_tree_block root 5 root dir 256 error root 5 inode 260 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref root 5 inode 262 errors 2001, no inode item, link count wrong unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref root 5 inode 263 errors 2001, no inode item, link count wrong (...) Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-11-27 21:14:15 +00:00
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
}
out:
/* once for us */
free_extent_map(em);
return ret;
}
static int btrfs_relocate_chunk(struct btrfs_root *root,
u64 chunk_objectid,
u64 chunk_offset)
{
struct btrfs_root *extent_root;
struct btrfs_trans_handle *trans;
int ret;
root = root->fs_info->chunk_root;
extent_root = root->fs_info->extent_root;
ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret)
return -ENOSPC;
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
if (ret)
return ret;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_std_error(root->fs_info, ret);
return ret;
}
/*
* step two, delete the device extents and the
* chunk tree entries
*/
ret = btrfs_remove_chunk(trans, root, chunk_offset);
btrfs_end_transaction(trans, root);
return ret;
}
static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
{
struct btrfs_root *chunk_root = root->fs_info->chunk_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_chunk *chunk;
struct btrfs_key key;
struct btrfs_key found_key;
u64 chunk_type;
bool retried = false;
int failed = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
BUG_ON(ret == 0); /* Corruption */
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret < 0)
goto error;
if (ret > 0)
break;
2008-09-26 14:09:34 +00:00
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2008-09-26 14:09:34 +00:00
chunk = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(chunk_root,
found_key.objectid,
found_key.offset);
if (ret == -ENOSPC)
failed++;
else
BUG_ON(ret);
}
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
}
ret = 0;
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
error:
btrfs_free_path(path);
return ret;
}
static int insert_balance_item(struct btrfs_root *root,
struct btrfs_balance_control *bctl)
{
struct btrfs_trans_handle *trans;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret, err;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_BALANCE_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*item));
if (ret)
goto out;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
btrfs_set_balance_data(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
btrfs_set_balance_meta(leaf, item, &disk_bargs);
btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
btrfs_set_balance_sys(leaf, item, &disk_bargs);
btrfs_set_balance_flags(leaf, item, bctl->flags);
btrfs_mark_buffer_dirty(leaf);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
return ret;
}
static int del_balance_item(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_key key;
int ret, err;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
}
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_BALANCE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
if (ret > 0) {
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
out:
btrfs_free_path(path);
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
return ret;
}
/*
* This is a heuristic used to reduce the number of chunks balanced on
* resume after balance was interrupted.
*/
static void update_balance_args(struct btrfs_balance_control *bctl)
{
/*
* Turn on soft mode for chunk types that were being converted.
*/
if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
/*
* Turn on usage filter if is not already used. The idea is
* that chunks that we have already balanced should be
* reasonably full. Don't do it for chunks that are being
* converted - that will keep us from relocating unconverted
* (albeit full) chunks.
*/
if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->data.usage = 90;
}
if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->sys.usage = 90;
}
if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->meta.usage = 90;
}
}
/*
* Should be called with both balance and volume mutexes held to
* serialize other volume operations (add_dev/rm_dev/resize) with
* restriper. Same goes for unset_balance_control.
*/
static void set_balance_control(struct btrfs_balance_control *bctl)
{
struct btrfs_fs_info *fs_info = bctl->fs_info;
BUG_ON(fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = bctl;
spin_unlock(&fs_info->balance_lock);
}
static void unset_balance_control(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
BUG_ON(!fs_info->balance_ctl);
spin_lock(&fs_info->balance_lock);
fs_info->balance_ctl = NULL;
spin_unlock(&fs_info->balance_lock);
kfree(bctl);
}
/*
* Balance filters. Return 1 if chunk should be filtered out
* (should not be balanced).
*/
static int chunk_profiles_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->profiles & chunk_type)
return 0;
return 1;
}
static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
if (bargs->usage == 0)
user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->key.offset;
else
user_thresh = div_factor_fine(cache->key.offset,
bargs->usage);
if (chunk_used < user_thresh)
ret = 0;
btrfs_put_block_group(cache);
return ret;
}
static int chunk_devid_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
int i;
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
return 0;
}
return 1;
}
/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
u64 stripe_offset;
u64 stripe_length;
int factor;
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
return 0;
if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
factor = num_stripes / 2;
} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
factor = num_stripes - 1;
} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
factor = num_stripes - 2;
} else {
factor = num_stripes;
}
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
continue;
stripe_offset = btrfs_stripe_offset(leaf, stripe);
stripe_length = btrfs_chunk_length(leaf, chunk);
stripe_length = div_u64(stripe_length, factor);
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
return 0;
}
return 1;
}
/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
struct btrfs_chunk *chunk,
u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
if (chunk_offset < bargs->vend &&
chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
/* at least part of the chunk is inside this vrange */
return 0;
return 1;
}
static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return 0;
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->target == chunk_type)
return 1;
return 0;
}
static int should_balance_chunk(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 chunk_offset)
{
struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
return 0;
}
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
bargs = &bctl->data;
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
bargs = &bctl->sys;
else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
bargs = &bctl->meta;
/* profiles filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
chunk_profiles_filter(chunk_type, bargs)) {
return 0;
}
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
return 0;
}
/* devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
chunk_devid_filter(leaf, chunk, bargs)) {
return 0;
}
/* drange filter, makes sense only with devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
return 0;
}
/* vrange filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
return 0;
}
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
return 0;
}
/*
* limited by count, must be the last filter
*/
if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
if (bargs->limit == 0)
return 0;
else
bargs->limit--;
}
return 1;
}
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
struct btrfs_root *chunk_root = fs_info->chunk_root;
struct btrfs_root *dev_root = fs_info->dev_root;
struct list_head *devices;
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_trans_handle *trans;
struct extent_buffer *leaf;
int slot;
int ret;
int enospc_errors = 0;
bool counting = true;
u64 limit_data = bctl->data.limit;
u64 limit_meta = bctl->meta.limit;
u64 limit_sys = bctl->sys.limit;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
if (!device->writeable ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
device->is_tgtdev_for_dev_replace)
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
if (ret == -ENOSPC)
break;
BUG_ON(ret);
trans = btrfs_start_transaction(dev_root, 0);
BUG_ON(IS_ERR(trans));
ret = btrfs_grow_device(trans, device, old_size);
BUG_ON(ret);
btrfs_end_transaction(trans, dev_root);
}
/* step two, relocate all the chunks */
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto error;
}
/* zero out stat counters */
spin_lock(&fs_info->balance_lock);
memset(&bctl->stat, 0, sizeof(bctl->stat));
spin_unlock(&fs_info->balance_lock);
again:
if (!counting) {
bctl->data.limit = limit_data;
bctl->meta.limit = limit_meta;
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
atomic_read(&fs_info->balance_cancel_req)) {
ret = -ECANCELED;
goto error;
}
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
/*
* this shouldn't happen, it means the last relocate
* failed
*/
if (ret == 0)
BUG(); /* FIXME break ? */
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
ret = 0;
break;
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid)
break;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
if (!counting) {
spin_lock(&fs_info->balance_lock);
bctl->stat.considered++;
spin_unlock(&fs_info->balance_lock);
}
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
btrfs_release_path(path);
if (!ret)
goto loop;
if (counting) {
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
goto loop;
}
ret = btrfs_relocate_chunk(chunk_root,
found_key.objectid,
found_key.offset);
if (ret && ret != -ENOSPC)
goto error;
if (ret == -ENOSPC) {
enospc_errors++;
} else {
spin_lock(&fs_info->balance_lock);
bctl->stat.completed++;
spin_unlock(&fs_info->balance_lock);
}
loop:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
}
if (counting) {
btrfs_release_path(path);
counting = false;
goto again;
}
error:
btrfs_free_path(path);
if (enospc_errors) {
btrfs_info(fs_info, "%d enospc errors during balance",
enospc_errors);
if (!ret)
ret = -ENOSPC;
}
return ret;
}
/**
* alloc_profile_is_valid - see if a given profile is valid and reduced
* @flags: profile to validate
* @extended: if true @flags is treated as an extended profile
*/
static int alloc_profile_is_valid(u64 flags, int extended)
{
u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
BTRFS_BLOCK_GROUP_PROFILE_MASK);
flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
/* 1) check that all other bits are zeroed */
if (flags & ~mask)
return 0;
/* 2) see if profile is reduced */
if (flags == 0)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
return (flags & (flags - 1)) == 0;
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
{
/* cancel requested || normal exit path */
return atomic_read(&fs_info->balance_cancel_req) ||
(atomic_read(&fs_info->balance_pause_req) == 0 &&
atomic_read(&fs_info->balance_cancel_req) == 0);
}
static void __cancel_balance(struct btrfs_fs_info *fs_info)
{
int ret;
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
if (ret)
btrfs_std_error(fs_info, ret);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
/*
* Should be called with both balance and volume mutexes held
*/
int btrfs_balance(struct btrfs_balance_control *bctl,
struct btrfs_ioctl_balance_args *bargs)
{
struct btrfs_fs_info *fs_info = bctl->fs_info;
u64 allowed;
int mixed = 0;
int ret;
u64 num_devices;
unsigned seq;
if (btrfs_fs_closing(fs_info) ||
atomic_read(&fs_info->balance_pause_req) ||
atomic_read(&fs_info->balance_cancel_req)) {
ret = -EINVAL;
goto out;
}
allowed = btrfs_super_incompat_flags(fs_info->super_copy);
if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
mixed = 1;
/*
* In case of mixed groups both data and meta should be picked,
* and identical options should be given for both of them.
*/
allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
if (mixed && (bctl->flags & allowed)) {
if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
!(bctl->flags & BTRFS_BALANCE_METADATA) ||
memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
btrfs_err(fs_info, "with mixed groups data and "
"metadata balance options must be the same");
ret = -EINVAL;
goto out;
}
}
num_devices = fs_info->fs_devices->num_devices;
btrfs_dev_replace_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
BUG_ON(num_devices < 1);
num_devices--;
}
btrfs_dev_replace_unlock(&fs_info->dev_replace);
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
if (num_devices == 1)
allowed |= BTRFS_BLOCK_GROUP_DUP;
else if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
if (num_devices > 2)
allowed |= BTRFS_BLOCK_GROUP_RAID5;
if (num_devices > 3)
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->data.target, 1) ||
(bctl->data.target & ~allowed))) {
btrfs_err(fs_info, "unable to start balance with target "
"data profile %llu",
bctl->data.target);
ret = -EINVAL;
goto out;
}
if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->meta.target, 1) ||
(bctl->meta.target & ~allowed))) {
btrfs_err(fs_info,
"unable to start balance with target metadata profile %llu",
bctl->meta.target);
ret = -EINVAL;
goto out;
}
if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->sys.target, 1) ||
(bctl->sys.target & ~allowed))) {
btrfs_err(fs_info,
"unable to start balance with target system profile %llu",
bctl->sys.target);
ret = -EINVAL;
goto out;
}
/* allow dup'ed data chunks only in mixed mode */
if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
btrfs_err(fs_info, "dup for data is not allowed");
ret = -EINVAL;
goto out;
}
/* allow to reduce meta or sys integrity only if force set */
allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_RAID6;
do {
seq = read_seqbegin(&fs_info->profiles_lock);
if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_system_alloc_bits & allowed) &&
!(bctl->sys.target & allowed)) ||
((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(fs_info->avail_metadata_alloc_bits & allowed) &&
!(bctl->meta.target & allowed))) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
btrfs_info(fs_info, "force reducing metadata integrity");
} else {
btrfs_err(fs_info, "balance will reduce metadata "
"integrity, use force if you want this");
ret = -EINVAL;
goto out;
}
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
int num_tolerated_disk_barrier_failures;
u64 target = bctl->sys.target;
num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
if (num_tolerated_disk_barrier_failures > 0 &&
(target &
(BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
num_tolerated_disk_barrier_failures = 0;
else if (num_tolerated_disk_barrier_failures > 1 &&
(target &
(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
num_tolerated_disk_barrier_failures = 1;
fs_info->num_tolerated_disk_barrier_failures =
num_tolerated_disk_barrier_failures;
}
ret = insert_balance_item(fs_info->tree_root, bctl);
if (ret && ret != -EEXIST)
goto out;
if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
BUG_ON(ret == -EEXIST);
set_balance_control(bctl);
} else {
BUG_ON(ret != -EEXIST);
spin_lock(&fs_info->balance_lock);
update_balance_args(bctl);
spin_unlock(&fs_info->balance_lock);
}
atomic_inc(&fs_info->balance_running);
mutex_unlock(&fs_info->balance_mutex);
ret = __btrfs_balance(fs_info);
mutex_lock(&fs_info->balance_mutex);
atomic_dec(&fs_info->balance_running);
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
}
if (bargs) {
memset(bargs, 0, sizeof(*bargs));
update_ioctl_balance_args(fs_info, 0, bargs);
}
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
__cancel_balance(fs_info);
}
wake_up(&fs_info->balance_wait_q);
return ret;
out:
if (bctl->flags & BTRFS_BALANCE_RESUME)
__cancel_balance(fs_info);
else {
kfree(bctl);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
return ret;
}
static int balance_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
int ret = 0;
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
btrfs_info(fs_info, "continuing balance");
ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
return ret;
}
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *tsk;
spin_lock(&fs_info->balance_lock);
if (!fs_info->balance_ctl) {
spin_unlock(&fs_info->balance_lock);
return 0;
}
spin_unlock(&fs_info->balance_lock);
if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
btrfs_info(fs_info, "force skipping balance");
return 0;
}
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
return PTR_ERR_OR_ZERO(tsk);
}
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl;
struct btrfs_balance_item *item;
struct btrfs_disk_balance_args disk_bargs;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_BALANCE_OBJECTID;
key.type = BTRFS_BALANCE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) { /* ret = -ENOENT; */
ret = 0;
goto out;
}
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
if (!bctl) {
ret = -ENOMEM;
goto out;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
bctl->fs_info = fs_info;
bctl->flags = btrfs_balance_flags(leaf, item);
bctl->flags |= BTRFS_BALANCE_RESUME;
btrfs_balance_data(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
btrfs_balance_meta(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
set_balance_control(bctl);
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
out:
btrfs_free_path(path);
return ret;
}
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
int ret = 0;
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return -ENOTCONN;
}
if (atomic_read(&fs_info->balance_running)) {
atomic_inc(&fs_info->balance_pause_req);
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
atomic_read(&fs_info->balance_running) == 0);
mutex_lock(&fs_info->balance_mutex);
/* we are good with balance_ctl ripped off from under us */
BUG_ON(atomic_read(&fs_info->balance_running));
atomic_dec(&fs_info->balance_pause_req);
} else {
ret = -ENOTCONN;
}
mutex_unlock(&fs_info->balance_mutex);
return ret;
}
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
if (fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
return -ENOTCONN;
}
atomic_inc(&fs_info->balance_cancel_req);
/*
* if we are running just wait and return, balance item is
* deleted in btrfs_balance in this case
*/
if (atomic_read(&fs_info->balance_running)) {
mutex_unlock(&fs_info->balance_mutex);
wait_event(fs_info->balance_wait_q,
atomic_read(&fs_info->balance_running) == 0);
mutex_lock(&fs_info->balance_mutex);
} else {
/* __cancel_balance needs volume_mutex */
mutex_unlock(&fs_info->balance_mutex);
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
__cancel_balance(fs_info);
mutex_unlock(&fs_info->volume_mutex);
}
BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
}
static int btrfs_uuid_scan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_key key;
struct btrfs_key max_key;
struct btrfs_path *path = NULL;
int ret = 0;
struct extent_buffer *eb;
int slot;
struct btrfs_root_item root_item;
u32 item_size;
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 09:28:34 +00:00
struct btrfs_trans_handle *trans = NULL;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
key.objectid = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = 0;
max_key.objectid = (u64)-1;
max_key.type = BTRFS_ROOT_ITEM_KEY;
max_key.offset = (u64)-1;
while (1) {
ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
if (ret > 0)
ret = 0;
break;
}
if (key.type != BTRFS_ROOT_ITEM_KEY ||
(key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.objectid != BTRFS_FS_TREE_OBJECTID) ||
key.objectid > BTRFS_LAST_FREE_OBJECTID)
goto skip;
eb = path->nodes[0];
slot = path->slots[0];
item_size = btrfs_item_size_nr(eb, slot);
if (item_size < sizeof(root_item))
goto skip;
read_extent_buffer(eb, &root_item,
btrfs_item_ptr_offset(eb, slot),
(int)sizeof(root_item));
if (btrfs_root_refs(&root_item) == 0)
goto skip;
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 09:28:34 +00:00
if (!btrfs_is_empty_uuid(root_item.uuid) ||
!btrfs_is_empty_uuid(root_item.received_uuid)) {
if (trans)
goto update_tree;
btrfs_release_path(path);
/*
* 1 - subvol uuid item
* 1 - received_subvol uuid item
*/
trans = btrfs_start_transaction(fs_info->uuid_root, 2);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
break;
}
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 09:28:34 +00:00
continue;
} else {
goto skip;
}
update_tree:
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
key.objectid);
if (ret < 0) {
btrfs_warn(fs_info, "uuid_tree_add failed %d",
ret);
break;
}
}
if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
root_item.received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
key.objectid);
if (ret < 0) {
btrfs_warn(fs_info, "uuid_tree_add failed %d",
ret);
break;
}
}
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 09:28:34 +00:00
skip:
if (trans) {
ret = btrfs_end_transaction(trans, fs_info->uuid_root);
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 09:28:34 +00:00
trans = NULL;
if (ret)
break;
}
btrfs_release_path(path);
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
} else if (key.objectid < (u64)-1) {
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
key.objectid++;
} else {
break;
}
cond_resched();
}
out:
btrfs_free_path(path);
Btrfs: fix deadlock in uuid scan kthread If there's an ongoing transaction when the uuid scan kthread attempts to create one, the kthread will block, waiting for that transaction to finish while it's keeping locks on the tree root, and in turn the existing transaction is waiting for those locks to be free. The stack trace reported by the kernel follows. [36700.671601] INFO: task btrfs-uuid:15480 blocked for more than 120 seconds. [36700.671602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671602] btrfs-uuid D 0000000000000000 0 15480 2 0x00000000 [36700.671604] ffff880710bd5b88 0000000000000046 ffff8803d36ba850 0000000000030000 [36700.671605] ffff8806d76dc530 ffff880710bd5fd8 ffff880710bd5fd8 ffff880710bd5fd8 [36700.671607] ffff8808098ac530 ffff8806d76dc530 ffff880710bd5b98 ffff8805e4508e40 [36700.671608] Call Trace: [36700.671610] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671620] [<ffffffffa05a3bdf>] wait_current_trans.isra.33+0xbf/0x120 [btrfs] [36700.671623] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671629] [<ffffffffa05a5b06>] start_transaction+0x3d6/0x530 [btrfs] [36700.671636] [<ffffffffa05bb1f4>] ? btrfs_get_token_32+0x64/0xf0 [btrfs] [36700.671642] [<ffffffffa05a5fbb>] btrfs_start_transaction+0x1b/0x20 [btrfs] [36700.671649] [<ffffffffa05c8a81>] btrfs_uuid_scan_kthread+0x211/0x3d0 [btrfs] [36700.671655] [<ffffffffa05c8870>] ? __btrfs_open_devices+0x2a0/0x2a0 [btrfs] [36700.671657] [<ffffffff81065fa0>] kthread+0xc0/0xd0 [36700.671659] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671661] [<ffffffff816fcd1c>] ret_from_fork+0x7c/0xb0 [36700.671662] [<ffffffff81065ee0>] ? flush_kthread_worker+0xb0/0xb0 [36700.671663] INFO: task btrfs:15481 blocked for more than 120 seconds. [36700.671664] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [36700.671665] btrfs D 0000000000000000 0 15481 15212 0x00000004 [36700.671666] ffff880248cbf4c8 0000000000000086 ffff8803d36ba700 ffff8801dbd5c280 [36700.671668] ffff880807815c40 ffff880248cbffd8 ffff880248cbffd8 ffff880248cbffd8 [36700.671669] ffff8805e86a0000 ffff880807815c40 ffff880248cbf4d8 ffff8801dbd5c280 [36700.671670] Call Trace: [36700.671672] [<ffffffff816f36b9>] schedule+0x29/0x70 [36700.671679] [<ffffffffa05d9b0d>] btrfs_tree_lock+0x6d/0x230 [btrfs] [36700.671680] [<ffffffff81066760>] ? add_wait_queue+0x60/0x60 [36700.671685] [<ffffffffa0582829>] btrfs_search_slot+0x999/0xb00 [btrfs] [36700.671691] [<ffffffffa05bd9de>] ? btrfs_lookup_first_ordered_extent+0x5e/0xb0 [btrfs] [36700.671698] [<ffffffffa05e3e54>] __btrfs_write_out_cache+0x8c4/0xa80 [btrfs] [36700.671704] [<ffffffffa05e4362>] btrfs_write_out_cache+0xb2/0xf0 [btrfs] [36700.671710] [<ffffffffa05c4441>] ? free_extent_buffer+0x61/0xc0 [btrfs] [36700.671716] [<ffffffffa0594c82>] btrfs_write_dirty_block_groups+0x562/0x650 [btrfs] [36700.671723] [<ffffffffa0610092>] commit_cowonly_roots+0x171/0x24b [btrfs] [36700.671729] [<ffffffffa05a4dde>] btrfs_commit_transaction+0x4fe/0xa10 [btrfs] [36700.671735] [<ffffffffa0610af3>] create_subvol+0x5c0/0x636 [btrfs] [36700.671742] [<ffffffffa05d49ff>] btrfs_mksubvol.isra.60+0x33f/0x3f0 [btrfs] [36700.671747] [<ffffffffa05d4bf2>] btrfs_ioctl_snap_create_transid+0x142/0x190 [btrfs] [36700.671752] [<ffffffffa05d4c6c>] ? btrfs_ioctl_snap_create+0x2c/0x80 [btrfs] [36700.671757] [<ffffffffa05d4c9e>] btrfs_ioctl_snap_create+0x5e/0x80 [btrfs] [36700.671759] [<ffffffff8113a764>] ? handle_pte_fault+0x84/0x920 [36700.671764] [<ffffffffa05d87eb>] btrfs_ioctl+0xf0b/0x1d00 [btrfs] [36700.671766] [<ffffffff8113c120>] ? handle_mm_fault+0x210/0x310 [36700.671768] [<ffffffff816f83a4>] ? __do_page_fault+0x284/0x4e0 [36700.671770] [<ffffffff81180aa6>] do_vfs_ioctl+0x96/0x550 [36700.671772] [<ffffffff81170fe3>] ? __sb_end_write+0x33/0x70 [36700.671774] [<ffffffff81180ff1>] SyS_ioctl+0x91/0xb0 [36700.671775] [<ffffffff816fcdc2>] system_call_fastpath+0x16/0x1b Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-08-28 09:28:34 +00:00
if (trans && !IS_ERR(trans))
btrfs_end_transaction(trans, fs_info->uuid_root);
if (ret)
btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
else
fs_info->update_uuid_tree_gen = 1;
up(&fs_info->uuid_tree_rescan_sem);
return 0;
}
/*
* Callback for btrfs_uuid_tree_iterate().
* returns:
* 0 check succeeded, the entry is not outdated.
* < 0 if an error occured.
* > 0 if the check failed, which means the caller shall remove the entry.
*/
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
u8 *uuid, u8 type, u64 subid)
{
struct btrfs_key key;
int ret = 0;
struct btrfs_root *subvol_root;
if (type != BTRFS_UUID_KEY_SUBVOL &&
type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
goto out;
key.objectid = subid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(subvol_root)) {
ret = PTR_ERR(subvol_root);
if (ret == -ENOENT)
ret = 1;
goto out;
}
switch (type) {
case BTRFS_UUID_KEY_SUBVOL:
if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
ret = 1;
break;
case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
if (memcmp(uuid, subvol_root->root_item.received_uuid,
BTRFS_UUID_SIZE))
ret = 1;
break;
}
out:
return ret;
}
static int btrfs_uuid_rescan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
int ret;
/*
* 1st step is to iterate through the existing UUID tree and
* to delete all entries that contain outdated data.
* 2nd step is to add all missing entries to the UUID tree.
*/
ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
if (ret < 0) {
btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
up(&fs_info->uuid_tree_rescan_sem);
return ret;
}
return btrfs_uuid_scan_kthread(data);
}
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *tree_root = fs_info->tree_root;
struct btrfs_root *uuid_root;
struct task_struct *task;
int ret;
/*
* 1 - root node
* 1 - root item
*/
trans = btrfs_start_transaction(tree_root, 2);
if (IS_ERR(trans))
return PTR_ERR(trans);
uuid_root = btrfs_create_tree(trans, fs_info,
BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
btrfs_abort_transaction(trans, tree_root,
PTR_ERR(uuid_root));
return PTR_ERR(uuid_root);
}
fs_info->uuid_root = uuid_root;
ret = btrfs_commit_transaction(trans, tree_root);
if (ret)
return ret;
down(&fs_info->uuid_tree_rescan_sem);
task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
if (IS_ERR(task)) {
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
btrfs_warn(fs_info, "failed to start uuid_scan task");
up(&fs_info->uuid_tree_rescan_sem);
return PTR_ERR(task);
}
return 0;
}
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
struct task_struct *task;
down(&fs_info->uuid_tree_rescan_sem);
task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
if (IS_ERR(task)) {
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
btrfs_warn(fs_info, "failed to start uuid_rescan task");
up(&fs_info->uuid_tree_rescan_sem);
return PTR_ERR(task);
}
return 0;
}
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
* The chunk relocation code actually frees the device extent
*/
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = device->dev_root;
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
u64 chunk_objectid;
u64 chunk_offset;
int ret;
int slot;
int failed = 0;
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = btrfs_device_get_total_bytes(device);
u64 diff = old_size - new_size;
if (device->is_tgtdev_for_dev_replace)
return -EINVAL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->reada = 2;
lock_chunks(root);
btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space -= diff;
spin_unlock(&root->fs_info->free_chunk_lock);
}
unlock_chunks(root);
again:
key.objectid = device->devid;
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
do {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto done;
ret = btrfs_previous_item(root, path, 0, key.type);
if (ret < 0)
goto done;
if (ret) {
ret = 0;
btrfs_release_path(path);
break;
}
l = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
btrfs_release_path(path);
break;
}
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
btrfs_release_path(path);
break;
}
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
failed++;
} while (key.offset-- > 0);
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
lock_chunks(root);
btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += diff;
spin_unlock(&root->fs_info->free_chunk_lock);
unlock_chunks(root);
goto done;
}
/* Shrinking succeeded, else we would be at "done". */
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto done;
}
lock_chunks(root);
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->resized_list))
list_add_tail(&device->resized_list,
&root->fs_info->fs_devices->resized_devices);
WARN_ON(diff > old_total);
btrfs_set_super_total_bytes(super_copy, old_total - diff);
unlock_chunks(root);
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
btrfs_end_transaction(trans, root);
done:
btrfs_free_path(path);
return ret;
}
static int btrfs_add_system_chunk(struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
lock_chunks(root);
array_size = btrfs_super_sys_array_size(super_copy);
if (array_size + item_size + sizeof(disk_key)
> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
unlock_chunks(root);
return -EFBIG;
}
ptr = super_copy->sys_chunk_array + array_size;
btrfs_cpu_key_to_disk(&disk_key, key);
memcpy(ptr, &disk_key, sizeof(disk_key));
ptr += sizeof(disk_key);
memcpy(ptr, chunk, item_size);
item_size += sizeof(disk_key);
btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
unlock_chunks(root);
return 0;
}
/*
* sort the devices in descending order by max_avail, total_avail
*/
static int btrfs_cmp_device_info(const void *a, const void *b)
{
const struct btrfs_device_info *di_a = a;
const struct btrfs_device_info *di_b = b;
if (di_a->max_avail > di_b->max_avail)
return -1;
if (di_a->max_avail < di_b->max_avail)
return 1;
if (di_a->total_avail > di_b->total_avail)
return -1;
if (di_a->total_avail < di_b->total_avail)
return 1;
return 0;
}
static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
.dev_stripes = 1,
.devs_max = 0, /* 0 == as many as possible */
.devs_min = 4,
.devs_increment = 2,
.ncopies = 2,
},
[BTRFS_RAID_RAID1] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 2,
.devs_min = 2,
.devs_increment = 2,
.ncopies = 2,
},
[BTRFS_RAID_DUP] = {
.sub_stripes = 1,
.dev_stripes = 2,
.devs_max = 1,
.devs_min = 1,
.devs_increment = 1,
.ncopies = 2,
},
[BTRFS_RAID_RAID0] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 2,
.devs_increment = 1,
.ncopies = 1,
},
[BTRFS_RAID_SINGLE] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 1,
.devs_min = 1,
.devs_increment = 1,
.ncopies = 1,
},
[BTRFS_RAID_RAID5] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 2,
.devs_increment = 1,
.ncopies = 2,
},
[BTRFS_RAID_RAID6] = {
.sub_stripes = 1,
.dev_stripes = 1,
.devs_max = 0,
.devs_min = 3,
.devs_increment = 1,
.ncopies = 3,
},
};
static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
/* TODO allow them to set a preferred stripe size */
return 64 * 1024;
}
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
return;
btrfs_set_fs_incompat(info, RAID56);
}
#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
- sizeof(struct btrfs_item) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
- 2 * sizeof(struct btrfs_disk_key) \
- 2 * sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 start,
u64 type)
{
struct btrfs_fs_info *info = extent_root->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
struct list_head *cur;
struct map_lookup *map = NULL;
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_device_info *devices_info = NULL;
u64 total_avail;
int num_stripes; /* total number of stripes to allocate */
int data_stripes; /* number of stripes that count for
block group size */
int sub_stripes; /* sub_stripes info for map */
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
int devs_min; /* min devs needed */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
int ret;
u64 max_stripe_size;
u64 max_chunk_size;
u64 stripe_size;
u64 num_bytes;
u64 raid_stripe_len = BTRFS_STRIPE_LEN;
int ndevs;
int i;
int j;
int index;
BUG_ON(!alloc_profile_is_valid(type, 0));
if (list_empty(&fs_devices->alloc_list))
return -ENOSPC;
index = __get_raid_index(type);
sub_stripes = btrfs_raid_array[index].sub_stripes;
dev_stripes = btrfs_raid_array[index].dev_stripes;
devs_max = btrfs_raid_array[index].devs_max;
devs_min = btrfs_raid_array[index].devs_min;
devs_increment = btrfs_raid_array[index].devs_increment;
ncopies = btrfs_raid_array[index].ncopies;
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
max_stripe_size = 1024 * 1024 * 1024;
else
max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = 32 * 1024 * 1024;
max_chunk_size = 2 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
} else {
btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
BUG_ON(1);
}
/* we don't want a chunk larger than 10% of writeable space */
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
return -ENOMEM;
cur = fs_devices->alloc_list.next;
/*
* in the first pass through the devices list, we gather information
* about the available holes on each device.
*/
ndevs = 0;
while (cur != &fs_devices->alloc_list) {
struct btrfs_device *device;
u64 max_avail;
u64 dev_offset;
device = list_entry(cur, struct btrfs_device, dev_alloc_list);
cur = cur->next;
if (!device->writeable) {
WARN(1, KERN_ERR
"BTRFS: read-only device in alloc_list\n");
continue;
}
if (!device->in_fs_metadata ||
device->is_tgtdev_for_dev_replace)
continue;
if (device->total_bytes > device->bytes_used)
total_avail = device->total_bytes - device->bytes_used;
else
total_avail = 0;
/* If there is no space on this device, skip it. */
if (total_avail == 0)
continue;
ret = find_free_dev_extent(trans, device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
goto error;
if (ret == 0)
max_avail = max_stripe_size * dev_stripes;
if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
continue;
if (ndevs == fs_devices->rw_devices) {
WARN(1, "%s: found more than %llu devices\n",
__func__, fs_devices->rw_devices);
break;
}
devices_info[ndevs].dev_offset = dev_offset;
devices_info[ndevs].max_avail = max_avail;
devices_info[ndevs].total_avail = total_avail;
devices_info[ndevs].dev = device;
++ndevs;
}
/*
* now sort the devices by hole size / available space
*/
sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
btrfs_cmp_device_info, NULL);
/* round down to number of usable stripes */
ndevs -= ndevs % devs_increment;
if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
ret = -ENOSPC;
goto error;
}
if (devs_max && ndevs > devs_max)
ndevs = devs_max;
/*
* the primary goal is to maximize the number of stripes, so use as many
* devices as possible, even if the stripes are not maximum sized.
*/
stripe_size = devices_info[ndevs-1].max_avail;
num_stripes = ndevs * dev_stripes;
/*
* this will have to be fixed for RAID1 and RAID10 over
* more drives
*/
data_stripes = num_stripes / ncopies;
if (type & BTRFS_BLOCK_GROUP_RAID5) {
raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
btrfs_super_stripesize(info->super_copy));
data_stripes = num_stripes - 1;
}
if (type & BTRFS_BLOCK_GROUP_RAID6) {
raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
btrfs_super_stripesize(info->super_copy));
data_stripes = num_stripes - 2;
}
/*
* Use the number of data stripes to figure out how big this chunk
* is really going to be in terms of logical address space,
* and compare that answer with the max chunk size
*/
if (stripe_size * data_stripes > max_chunk_size) {
u64 mask = (1ULL << 24) - 1;
stripe_size = div_u64(max_chunk_size, data_stripes);
/* bump the answer up to a 16MB boundary */
stripe_size = (stripe_size + mask) & ~mask;
/* but don't go higher than the limits we found
* while searching for free extents
*/
if (stripe_size > devices_info[ndevs-1].max_avail)
stripe_size = devices_info[ndevs-1].max_avail;
}
stripe_size = div_u64(stripe_size, dev_stripes);
/* align to BTRFS_STRIPE_LEN */
stripe_size = div_u64(stripe_size, raid_stripe_len);
stripe_size *= raid_stripe_len;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
ret = -ENOMEM;
goto error;
}
map->num_stripes = num_stripes;
for (i = 0; i < ndevs; ++i) {
for (j = 0; j < dev_stripes; ++j) {
int s = i * dev_stripes + j;
map->stripes[s].dev = devices_info[i].dev;
map->stripes[s].physical = devices_info[i].dev_offset +
j * stripe_size;
}
}
map->sector_size = extent_root->sectorsize;
map->stripe_len = raid_stripe_len;
map->io_align = raid_stripe_len;
map->io_width = raid_stripe_len;
map->type = type;
map->sub_stripes = sub_stripes;
num_bytes = stripe_size * data_stripes;
trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
Btrfs: add initial tracepoint support for btrfs Tracepoints can provide insight into why btrfs hits bugs and be greatly helpful for debugging, e.g dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0 dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0 btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0) btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0) btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8 flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0) flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0) flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0) Here is what I have added: 1) ordere_extent: btrfs_ordered_extent_add btrfs_ordered_extent_remove btrfs_ordered_extent_start btrfs_ordered_extent_put These provide critical information to understand how ordered_extents are updated. 2) extent_map: btrfs_get_extent extent_map is used in both read and write cases, and it is useful for tracking how btrfs specific IO is running. 3) writepage: __extent_writepage btrfs_writepage_end_io_hook Pages are cirtical resourses and produce a lot of corner cases during writeback, so it is valuable to know how page is written to disk. 4) inode: btrfs_inode_new btrfs_inode_request btrfs_inode_evict These can show where and when a inode is created, when a inode is evicted. 5) sync: btrfs_sync_file btrfs_sync_fs These show sync arguments. 6) transaction: btrfs_transaction_commit In transaction based filesystem, it will be useful to know the generation and who does commit. 7) back reference and cow: btrfs_delayed_tree_ref btrfs_delayed_data_ref btrfs_delayed_ref_head btrfs_cow_block Btrfs natively supports back references, these tracepoints are helpful on understanding btrfs's COW mechanism. 8) chunk: btrfs_chunk_alloc btrfs_chunk_free Chunk is a link between physical offset and logical offset, and stands for space infomation in btrfs, and these are helpful on tracing space things. 9) reserved_extent: btrfs_reserved_extent_alloc btrfs_reserved_extent_free These can show how btrfs uses its space. Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
em = alloc_extent_map();
if (!em) {
Btrfs: fix NULL pointer crash when running balance and scrub concurrently While running balance, scrub, fsstress concurrently we hit the following kernel crash: [56561.448845] BTRFS info (device sde): relocating block group 11005853696 flags 132 [56561.524077] BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 [56561.524237] IP: [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.524297] PGD 9be28067 PUD 7f3dd067 PMD 0 [56561.524325] Oops: 0000 [#1] SMP [....] [56561.527237] Call Trace: [56561.527309] [<ffffffffa038980e>] scrub_enumerate_chunks+0x24e/0x490 [btrfs] [56561.527392] [<ffffffff810abe00>] ? abort_exclusive_wait+0x50/0xb0 [56561.527476] [<ffffffffa038add4>] btrfs_scrub_dev+0x1a4/0x530 [btrfs] [56561.527561] [<ffffffffa0368107>] btrfs_ioctl+0x13f7/0x2a90 [btrfs] [56561.527639] [<ffffffff811c82f0>] do_vfs_ioctl+0x2e0/0x4c0 [56561.527712] [<ffffffff8109c384>] ? vtime_account_user+0x54/0x60 [56561.527788] [<ffffffff810f768c>] ? __audit_syscall_entry+0x9c/0xf0 [56561.527870] [<ffffffff811c8551>] SyS_ioctl+0x81/0xa0 [56561.527941] [<ffffffff815707f7>] tracesys+0xdd/0xe2 [...] [56561.528304] RIP [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.528395] RSP <ffff88004c0f5be8> [56561.528454] CR2: 0000000000000078 This is because in btrfs_relocate_chunk(), we will free @bdev directly while scrub may still hold extent mapping, and may access freed memory. Fix this problem by wrapping freeing @bdev work into free_extent_map() which is based on reference count. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:52 +00:00
kfree(map);
ret = -ENOMEM;
goto error;
}
Btrfs: fix NULL pointer crash when running balance and scrub concurrently While running balance, scrub, fsstress concurrently we hit the following kernel crash: [56561.448845] BTRFS info (device sde): relocating block group 11005853696 flags 132 [56561.524077] BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 [56561.524237] IP: [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.524297] PGD 9be28067 PUD 7f3dd067 PMD 0 [56561.524325] Oops: 0000 [#1] SMP [....] [56561.527237] Call Trace: [56561.527309] [<ffffffffa038980e>] scrub_enumerate_chunks+0x24e/0x490 [btrfs] [56561.527392] [<ffffffff810abe00>] ? abort_exclusive_wait+0x50/0xb0 [56561.527476] [<ffffffffa038add4>] btrfs_scrub_dev+0x1a4/0x530 [btrfs] [56561.527561] [<ffffffffa0368107>] btrfs_ioctl+0x13f7/0x2a90 [btrfs] [56561.527639] [<ffffffff811c82f0>] do_vfs_ioctl+0x2e0/0x4c0 [56561.527712] [<ffffffff8109c384>] ? vtime_account_user+0x54/0x60 [56561.527788] [<ffffffff810f768c>] ? __audit_syscall_entry+0x9c/0xf0 [56561.527870] [<ffffffff811c8551>] SyS_ioctl+0x81/0xa0 [56561.527941] [<ffffffff815707f7>] tracesys+0xdd/0xe2 [...] [56561.528304] RIP [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.528395] RSP <ffff88004c0f5be8> [56561.528454] CR2: 0000000000000078 This is because in btrfs_relocate_chunk(), we will free @bdev directly while scrub may still hold extent mapping, and may access freed memory. Fix this problem by wrapping freeing @bdev work into free_extent_map() which is based on reference count. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:52 +00:00
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = start;
em->len = num_bytes;
em->block_start = 0;
em->block_len = em->len;
em->orig_block_len = stripe_size;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
2013-04-05 20:51:15 +00:00
ret = add_extent_mapping(em_tree, em, 0);
if (!ret) {
list_add_tail(&em->list, &trans->transaction->pending_chunks);
atomic_inc(&em->refs);
}
write_unlock(&em_tree->lock);
if (ret) {
free_extent_map(em);
goto error;
}
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
start, num_bytes);
if (ret)
goto error_del_extent;
for (i = 0; i < map->num_stripes; i++) {
num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
}
spin_lock(&extent_root->fs_info->free_chunk_lock);
extent_root->fs_info->free_chunk_space -= (stripe_size *
map->num_stripes);
spin_unlock(&extent_root->fs_info->free_chunk_lock);
free_extent_map(em);
check_raid56_incompat_flag(extent_root->fs_info, type);
kfree(devices_info);
return 0;
error_del_extent:
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* One for our allocation */
free_extent_map(em);
/* One for the tree reference */
free_extent_map(em);
Btrfs: fix fs mapping extent map leak On chunk allocation error (label "error_del_extent"), after adding the extent map to the tree and to the pending chunks list, we would leave decrementing the extent map's refcount by 2 instead of 3 (our allocation + tree reference + list reference). Also, on chunk/block group removal, if the block group was on the list pending_chunks we weren't decrementing the respective list reference. Detected by 'rmmod btrfs': [20770.105881] kmem_cache_destroy btrfs_extent_map: Slab cache still has objects [20770.106127] CPU: 2 PID: 11093 Comm: rmmod Tainted: G W L 3.17.0-rc5-btrfs-next-1+ #1 [20770.106128] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [20770.106130] 0000000000000000 ffff8800ba867eb8 ffffffff813e7a13 ffff8800a2e11040 [20770.106132] ffff8800ba867ed0 ffffffff81105d0c 0000000000000000 ffff8800ba867ee0 [20770.106134] ffffffffa035d65e ffff8800ba867ef0 ffffffffa03b0654 ffff8800ba867f78 [20770.106136] Call Trace: [20770.106142] [<ffffffff813e7a13>] dump_stack+0x45/0x56 [20770.106145] [<ffffffff81105d0c>] kmem_cache_destroy+0x4b/0x90 [20770.106164] [<ffffffffa035d65e>] extent_map_exit+0x1a/0x1c [btrfs] [20770.106176] [<ffffffffa03b0654>] exit_btrfs_fs+0x27/0x9d3 [btrfs] [20770.106179] [<ffffffff8109dc97>] SyS_delete_module+0x153/0x1c4 [20770.106182] [<ffffffff8121261b>] ? trace_hardirqs_on_thunk+0x3a/0x3c [20770.106184] [<ffffffff813ebf52>] system_call_fastpath+0x16/0x1b This applies on top (depends on) of my previous patch titled: "Btrfs: fix race between fs trimming and block group remove/allocation" But the issue in fact was already present before that change, it only became easier to hit after Josef's 3.18 patch that added automatic removal of empty block groups. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-12-02 18:07:30 +00:00
/* One for the pending_chunks list reference */
free_extent_map(em);
error:
kfree(devices_info);
return ret;
}
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root,
u64 chunk_offset, u64 chunk_size)
{
struct btrfs_key key;
struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
struct btrfs_device *device;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
struct extent_map_tree *em_tree;
struct extent_map *em;
struct map_lookup *map;
size_t item_size;
u64 dev_offset;
u64 stripe_size;
int i = 0;
int ret;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_crit(extent_root->fs_info, "unable to find logical "
"%Lu len %Lu", chunk_offset, chunk_size);
return -EINVAL;
}
if (em->start != chunk_offset || em->len != chunk_size) {
btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
" %Lu-%Lu, found %Lu-%Lu", chunk_offset,
chunk_size, em->start, em->len);
free_extent_map(em);
return -EINVAL;
}
map = (struct map_lookup *)em->bdev;
item_size = btrfs_chunk_item_size(map->num_stripes);
stripe_size = em->orig_block_len;
chunk = kzalloc(item_size, GFP_NOFS);
if (!chunk) {
ret = -ENOMEM;
goto out;
}
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
ret = btrfs_update_device(trans, device);
if (ret)
goto out;
ret = btrfs_alloc_dev_extent(trans, device,
chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
chunk_offset, dev_offset,
stripe_size);
if (ret)
goto out;
}
stripe = &chunk->stripe;
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
btrfs_set_stack_stripe_devid(stripe, device->devid);
btrfs_set_stack_stripe_offset(stripe, dev_offset);
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
btrfs_set_stack_chunk_length(chunk, chunk_size);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
btrfs_set_stack_chunk_type(chunk, map->type);
btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.type = BTRFS_CHUNK_ITEM_KEY;
key.offset = chunk_offset;
ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
/*
* TODO: Cleanup of inserted chunk root in case of
* failure.
*/
ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
item_size);
}
Btrfs: add initial tracepoint support for btrfs Tracepoints can provide insight into why btrfs hits bugs and be greatly helpful for debugging, e.g dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0 dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0 btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0) btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0) btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8 flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0) flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0) flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0) btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0) Here is what I have added: 1) ordere_extent: btrfs_ordered_extent_add btrfs_ordered_extent_remove btrfs_ordered_extent_start btrfs_ordered_extent_put These provide critical information to understand how ordered_extents are updated. 2) extent_map: btrfs_get_extent extent_map is used in both read and write cases, and it is useful for tracking how btrfs specific IO is running. 3) writepage: __extent_writepage btrfs_writepage_end_io_hook Pages are cirtical resourses and produce a lot of corner cases during writeback, so it is valuable to know how page is written to disk. 4) inode: btrfs_inode_new btrfs_inode_request btrfs_inode_evict These can show where and when a inode is created, when a inode is evicted. 5) sync: btrfs_sync_file btrfs_sync_fs These show sync arguments. 6) transaction: btrfs_transaction_commit In transaction based filesystem, it will be useful to know the generation and who does commit. 7) back reference and cow: btrfs_delayed_tree_ref btrfs_delayed_data_ref btrfs_delayed_ref_head btrfs_cow_block Btrfs natively supports back references, these tracepoints are helpful on understanding btrfs's COW mechanism. 8) chunk: btrfs_chunk_alloc btrfs_chunk_free Chunk is a link between physical offset and logical offset, and stands for space infomation in btrfs, and these are helpful on tracing space things. 9) reserved_extent: btrfs_reserved_extent_alloc btrfs_reserved_extent_free These can show how btrfs uses its space. Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 11:18:59 +00:00
out:
kfree(chunk);
free_extent_map(em);
return ret;
}
/*
* Chunk allocation falls into two parts. The first part does works
* that make the new allocated chunk useable, but not do any operation
* that modifies the chunk tree. The second part does the works that
* require modifying the chunk tree. This division is important for the
* bootstrap process of adding storage to a seed btrfs.
*/
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 type)
{
u64 chunk_offset;
ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
chunk_offset = find_next_chunk(extent_root->fs_info);
return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
}
static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device)
{
u64 chunk_offset;
u64 sys_chunk_offset;
u64 alloc_profile;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
int ret;
chunk_offset = find_next_chunk(fs_info);
alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
alloc_profile);
if (ret)
return ret;
sys_chunk_offset = find_next_chunk(root->fs_info);
alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
alloc_profile);
return ret;
}
static inline int btrfs_chunk_max_errors(struct map_lookup *map)
{
int max_errors;
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID5 |
BTRFS_BLOCK_GROUP_DUP)) {
max_errors = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
max_errors = 2;
} else {
max_errors = 0;
}
return max_errors;
}
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
int readonly = 0;
int miss_ndevs = 0;
int i;
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
read_unlock(&map_tree->map_tree.lock);
if (!em)
return 1;
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->missing) {
miss_ndevs++;
continue;
}
if (!map->stripes[i].dev->writeable) {
readonly = 1;
goto end;
}
}
/*
* If the number of missing devices is larger than max errors,
* we can not write the data into that chunk successfully, so
* set it readonly.
*/
if (miss_ndevs > btrfs_chunk_max_errors(map))
readonly = 1;
end:
free_extent_map(em);
return readonly;
}
void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
extent_map_tree_init(&tree->map_tree);
}
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
{
struct extent_map *em;
while (1) {
write_lock(&tree->map_tree.lock);
em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
if (em)
remove_extent_mapping(&tree->map_tree, em);
write_unlock(&tree->map_tree.lock);
if (!em)
break;
/* once for us */
free_extent_map(em);
/* once for the tree */
free_extent_map(em);
}
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
read_unlock(&em_tree->lock);
/*
* We could return errors for these cases, but that could get ugly and
* we'd probably do the same thing which is just not do anything else
* and exit, so return 1 so the callers don't try to use other copies.
*/
if (!em) {
btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
logical+len);
return 1;
}
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
"%Lu-%Lu", logical, logical+len, em->start,
em->start + em->len);
free_extent_map(em);
return 1;
}
map = (struct map_lookup *)em->bdev;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
ret = 2;
else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
ret = 3;
else
ret = 1;
free_extent_map(em);
btrfs_dev_replace_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
ret++;
btrfs_dev_replace_unlock(&fs_info->dev_replace);
return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
struct btrfs_mapping_tree *map_tree,
u64 logical)
{
struct extent_map *em;
struct map_lookup *map;
struct extent_map_tree *em_tree = &map_tree->map_tree;
unsigned long len = root->sectorsize;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
read_unlock(&em_tree->lock);
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
free_extent_map(em);
return len;
}
int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
u64 logical, u64 len, int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
read_unlock(&em_tree->lock);
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first, int num,
int optimal, int dev_replace_is_ongoing)
{
int i;
int tolerance;
struct btrfs_device *srcdev;
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
srcdev = fs_info->dev_replace.srcdev;
else
srcdev = NULL;
/*
* try to avoid the drive that is the source drive for a
* dev-replace procedure, only choose it if no other non-missing
* mirror is available
*/
for (tolerance = 0; tolerance < 2; tolerance++) {
if (map->stripes[optimal].dev->bdev &&
(tolerance || map->stripes[optimal].dev != srcdev))
return optimal;
for (i = first; i < first + num; i++) {
if (map->stripes[i].dev->bdev &&
(tolerance || map->stripes[i].dev != srcdev))
return i;
}
}
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
return optimal;
}
static inline int parity_smaller(u64 a, u64 b)
{
return a > b;
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
struct btrfs_bio_stripe s;
int i;
u64 l;
int again = 1;
while (again) {
again = 0;
for (i = 0; i < num_stripes - 1; i++) {
if (parity_smaller(bbio->raid_map[i],
bbio->raid_map[i+1])) {
s = bbio->stripes[i];
l = bbio->raid_map[i];
bbio->stripes[i] = bbio->stripes[i+1];
bbio->raid_map[i] = bbio->raid_map[i+1];
bbio->stripes[i+1] = s;
bbio->raid_map[i+1] = l;
again = 1;
}
}
}
}
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
struct btrfs_bio *bbio = kzalloc(
/* the size of the btrfs_bio */
sizeof(struct btrfs_bio) +
/* plus the variable array for the stripes */
sizeof(struct btrfs_bio_stripe) * (total_stripes) +
/* plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
/*
* plus the raid_map, which includes both the tgt dev
* and the stripes
*/
sizeof(u64) * (total_stripes),
GFP_NOFS);
if (!bbio)
return NULL;
atomic_set(&bbio->error, 0);
atomic_set(&bbio->refs, 1);
return bbio;
}
void btrfs_get_bbio(struct btrfs_bio *bbio)
{
WARN_ON(!atomic_read(&bbio->refs));
atomic_inc(&bbio->refs);
}
void btrfs_put_bbio(struct btrfs_bio *bbio)
{
if (!bbio)
return;
if (atomic_dec_and_test(&bbio->refs))
kfree(bbio);
}
static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
int mirror_num, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
u64 stripe_end_offset;
u64 stripe_nr;
u64 stripe_nr_orig;
u64 stripe_nr_end;
u64 stripe_len;
u32 stripe_index;
int i;
int ret = 0;
int num_stripes;
int max_errors = 0;
int tgtdev_indexes = 0;
struct btrfs_bio *bbio = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
int num_alloc_stripes;
int patch_the_first_stripe_for_dev_replace = 0;
u64 physical_to_patch_in_first_stripe = 0;
u64 raid56_full_stripe_start = (u64)-1;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
if (!em) {
btrfs_crit(fs_info, "unable to find logical %llu len %llu",
logical, *length);
return -EINVAL;
}
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
"found %Lu-%Lu", logical, em->start,
em->start + em->len);
free_extent_map(em);
return -EINVAL;
}
map = (struct map_lookup *)em->bdev;
offset = logical - em->start;
stripe_len = map->stripe_len;
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
* to get to this block
*/
stripe_nr = div64_u64(stripe_nr, stripe_len);
stripe_offset = stripe_nr * stripe_len;
BUG_ON(offset < stripe_offset);
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
/* if we're here for raid56, we need to know the stripe aligned start */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
raid56_full_stripe_start = offset;
/* allow a write of a full stripe, but make sure we don't
* allow straddling of stripes
*/
raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
full_stripe_len);
raid56_full_stripe_start *= full_stripe_len;
}
if (rw & REQ_DISCARD) {
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
*length = min_t(u64, em->len - offset, *length);
} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
u64 max_len;
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
stripe (on a single disk). */
if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
(rw & REQ_WRITE)) {
max_len = stripe_len * nr_data_stripes(map) -
(offset - raid56_full_stripe_start);
} else {
/* we limit the length of each bio to what fits in a stripe */
max_len = stripe_len - stripe_offset;
}
*length = min_t(u64, em->len - offset, max_len);
} else {
*length = em->len - offset;
}
/* This is for when we're called from btrfs_merge_bio_hook() and all
it cares about is the length */
if (!bbio_ret)
goto out;
btrfs_dev_replace_lock(dev_replace);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (!dev_replace_is_ongoing)
btrfs_dev_replace_unlock(dev_replace);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
dev_replace->tgtdev != NULL) {
/*
* in dev-replace case, for repair case (that's the only
* case where the mirror is selected explicitly when
* calling btrfs_map_block), blocks left of the left cursor
* can also be read from the target drive.
* For REQ_GET_READ_MIRRORS, the target drive is added as
* the last one to the array of stripes. For READ, it also
* needs to be supported using the same mirror number.
* If the requested block is not left of the left cursor,
* EIO is returned. This can happen because btrfs_num_copies()
* returns one more in the dev-replace case.
*/
u64 tmp_length = *length;
struct btrfs_bio *tmp_bbio = NULL;
int tmp_num_stripes;
u64 srcdev_devid = dev_replace->srcdev->devid;
int index_srcdev = 0;
int found = 0;
u64 physical_of_found = 0;
ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
logical, &tmp_length, &tmp_bbio, 0, 0);
if (ret) {
WARN_ON(tmp_bbio != NULL);
goto out;
}
tmp_num_stripes = tmp_bbio->num_stripes;
if (mirror_num > tmp_num_stripes) {
/*
* REQ_GET_READ_MIRRORS does not contain this
* mirror, that means that the requested area
* is not left of the left cursor
*/
ret = -EIO;
btrfs_put_bbio(tmp_bbio);
goto out;
}
/*
* process the rest of the function using the mirror_num
* of the source drive. Therefore look it up first.
* At the end, patch the device pointer to the one of the
* target drive.
*/
for (i = 0; i < tmp_num_stripes; i++) {
if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it
* simple, only add the mirror with the
* lowest physical address
*/
if (found &&
physical_of_found <=
tmp_bbio->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
physical_of_found =
tmp_bbio->stripes[i].physical;
}
}
if (found) {
mirror_num = index_srcdev + 1;
patch_the_first_stripe_for_dev_replace = 1;
physical_to_patch_in_first_stripe = physical_of_found;
} else {
WARN_ON(1);
ret = -EIO;
btrfs_put_bbio(tmp_bbio);
goto out;
}
btrfs_put_bbio(tmp_bbio);
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
num_stripes = 1;
stripe_index = 0;
stripe_nr_orig = stripe_nr;
stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
stripe_end_offset = stripe_nr_end * map->stripe_len -
(offset + *length);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
if (rw & REQ_DISCARD)
num_stripes = min_t(u64, map->num_stripes,
stripe_nr_end - stripe_nr_orig);
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
mirror_num = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
map->num_stripes,
current->pid % map->num_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
num_stripes = map->num_stripes;
} else if (mirror_num) {
stripe_index = mirror_num - 1;
} else {
mirror_num = 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
u32 factor = map->num_stripes / map->sub_stripes;
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
stripe_index *= map->sub_stripes;
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
num_stripes = map->sub_stripes;
else if (rw & REQ_DISCARD)
num_stripes = min_t(u64, map->sub_stripes *
(stripe_nr_end - stripe_nr_orig),
map->num_stripes);
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
stripe_index,
map->sub_stripes, stripe_index +
current->pid % map->sub_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
if (need_raid_map &&
((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
/* push stripe_nr back to the start of the full stripe */
stripe_nr = div_u64(raid56_full_stripe_start,
stripe_len * nr_data_stripes(map));
/* RAID[56] write or recovery. Return all stripes */
num_stripes = map->num_stripes;
max_errors = nr_parity_stripes(map);
*length = map->stripe_len;
stripe_index = 0;
stripe_offset = 0;
} else {
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
* Mirror #3 is RAID6 Q block.
*/
stripe_nr = div_u64_rem(stripe_nr,
nr_data_stripes(map), &stripe_index);
if (mirror_num > 1)
stripe_index = nr_data_stripes(map) +
mirror_num - 2;
/* We distribute the parity blocks across stripes */
div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
&stripe_index);
if (!(rw & (REQ_WRITE | REQ_DISCARD |
REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
mirror_num = 1;
}
} else {
/*
* after this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
&stripe_index);
mirror_num = stripe_index + 1;
}
BUG_ON(stripe_index >= map->num_stripes);
num_alloc_stripes = num_stripes;
if (dev_replace_is_ongoing) {
if (rw & (REQ_WRITE | REQ_DISCARD))
num_alloc_stripes <<= 1;
if (rw & REQ_GET_READ_MIRRORS)
num_alloc_stripes++;
tgtdev_indexes = num_stripes;
}
bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
if (!bbio) {
ret = -ENOMEM;
goto out;
}
if (dev_replace_is_ongoing)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
/* build raid_map */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
u64 tmp;
unsigned rot;
bbio->raid_map = (u64 *)((void *)bbio->stripes +
sizeof(struct btrfs_bio_stripe) *
num_alloc_stripes +
sizeof(int) * tgtdev_indexes);
/* Work out the disk rotation on this stripe-set */
div_u64_rem(stripe_nr, num_stripes, &rot);
/* Fill in the logical address of each stripe */
tmp = stripe_nr * nr_data_stripes(map);
for (i = 0; i < nr_data_stripes(map); i++)
bbio->raid_map[(i+rot) % num_stripes] =
em->start + (tmp + i) * map->stripe_len;
bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
if (map->type & BTRFS_BLOCK_GROUP_RAID6)
bbio->raid_map[(i+rot+1) % num_stripes] =
RAID6_Q_STRIPE;
}
if (rw & REQ_DISCARD) {
u32 factor = 0;
u32 sub_stripes = 0;
u64 stripes_per_dev = 0;
u32 remaining_stripes = 0;
u32 last_stripe = 0;
if (map->type &
(BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
if (map->type & BTRFS_BLOCK_GROUP_RAID0)
sub_stripes = 1;
else
sub_stripes = map->sub_stripes;
factor = map->num_stripes / sub_stripes;
stripes_per_dev = div_u64_rem(stripe_nr_end -
stripe_nr_orig,
factor,
&remaining_stripes);
div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
last_stripe *= sub_stripes;
}
for (i = 0; i < num_stripes; i++) {
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
bbio->stripes[i].dev = map->stripes[stripe_index].dev;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10)) {
bbio->stripes[i].length = stripes_per_dev *
map->stripe_len;
if (i / sub_stripes < remaining_stripes)
bbio->stripes[i].length +=
map->stripe_len;
/*
* Special for the first stripe and
* the last stripe:
*
* |-------|...|-------|
* |----------|
* off end_off
*/
if (i < sub_stripes)
bbio->stripes[i].length -=
stripe_offset;
if (stripe_index >= last_stripe &&
stripe_index <= (last_stripe +
sub_stripes - 1))
bbio->stripes[i].length -=
stripe_end_offset;
if (i == sub_stripes - 1)
stripe_offset = 0;
} else
bbio->stripes[i].length = *length;
stripe_index++;
if (stripe_index == map->num_stripes) {
/* This could only happen for RAID0/10 */
stripe_index = 0;
stripe_nr++;
}
}
} else {
for (i = 0; i < num_stripes; i++) {
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset +
stripe_nr * map->stripe_len;
bbio->stripes[i].dev =
map->stripes[stripe_index].dev;
stripe_index++;
}
}
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
max_errors = btrfs_chunk_max_errors(map);
if (bbio->raid_map)
sort_parity_stripes(bbio, num_stripes);
tgtdev_indexes = 0;
if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
dev_replace->tgtdev != NULL) {
int index_where_to_add;
u64 srcdev_devid = dev_replace->srcdev->devid;
/*
* duplicate the write operations while the dev replace
* procedure is running. Since the copying of the old disk
* to the new disk takes place at run time while the
* filesystem is mounted writable, the regular write
* operations to the old disk have to be duplicated to go
* to the new disk as well.
* Note that device->missing is handled by the caller, and
* that the write to the old disk is already set up in the
* stripes array.
*/
index_where_to_add = num_stripes;
for (i = 0; i < num_stripes; i++) {
if (bbio->stripes[i].dev->devid == srcdev_devid) {
/* write to new disk, too */
struct btrfs_bio_stripe *new =
bbio->stripes + index_where_to_add;
struct btrfs_bio_stripe *old =
bbio->stripes + i;
new->physical = old->physical;
new->length = old->length;
new->dev = dev_replace->tgtdev;
bbio->tgtdev_map[i] = index_where_to_add;
index_where_to_add++;
max_errors++;
tgtdev_indexes++;
}
}
num_stripes = index_where_to_add;
} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
dev_replace->tgtdev != NULL) {
u64 srcdev_devid = dev_replace->srcdev->devid;
int index_srcdev = 0;
int found = 0;
u64 physical_of_found = 0;
/*
* During the dev-replace procedure, the target drive can
* also be used to read data in case it is needed to repair
* a corrupt block elsewhere. This is possible if the
* requested area is left of the left cursor. In this area,
* the target drive is a full copy of the source drive.
*/
for (i = 0; i < num_stripes; i++) {
if (bbio->stripes[i].dev->devid == srcdev_devid) {
/*
* In case of DUP, in order to keep it
* simple, only add the mirror with the
* lowest physical address
*/
if (found &&
physical_of_found <=
bbio->stripes[i].physical)
continue;
index_srcdev = i;
found = 1;
physical_of_found = bbio->stripes[i].physical;
}
}
if (found) {
if (physical_of_found + map->stripe_len <=
dev_replace->cursor_left) {
struct btrfs_bio_stripe *tgtdev_stripe =
bbio->stripes + num_stripes;
tgtdev_stripe->physical = physical_of_found;
tgtdev_stripe->length =
bbio->stripes[index_srcdev].length;
tgtdev_stripe->dev = dev_replace->tgtdev;
bbio->tgtdev_map[index_srcdev] = num_stripes;
tgtdev_indexes++;
num_stripes++;
}
}
}
*bbio_ret = bbio;
bbio->map_type = map->type;
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
bbio->num_tgtdevs = tgtdev_indexes;
/*
* this is the case that REQ_READ && dev_replace_is_ongoing &&
* mirror_num == num_stripes + 1 && dev_replace target drive is
* available as a mirror
*/
if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
WARN_ON(num_stripes > 1);
bbio->stripes[0].dev = dev_replace->tgtdev;
bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
bbio->mirror_num = map->num_stripes + 1;
}
out:
if (dev_replace_is_ongoing)
btrfs_dev_replace_unlock(dev_replace);
free_extent_map(em);
return ret;
}
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
int need_raid_map)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
mirror_num, need_raid_map);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len)
{
struct extent_map_tree *em_tree = &map_tree->map_tree;
struct extent_map *em;
struct map_lookup *map;
u64 *buf;
u64 bytenr;
u64 length;
u64 stripe_nr;
u64 rmap_len;
int i, j, nr = 0;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_start, 1);
read_unlock(&em_tree->lock);
if (!em) {
printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
chunk_start);
return -EIO;
}
if (em->start != chunk_start) {
printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
em->start, chunk_start);
free_extent_map(em);
return -EIO;
}
map = (struct map_lookup *)em->bdev;
length = em->len;
rmap_len = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
length = div_u64(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
length = div_u64(length, map->num_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
length = div_u64(length, nr_data_stripes(map));
rmap_len = map->stripe_len * nr_data_stripes(map);
}
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
BUG_ON(!buf); /* -ENOMEM */
for (i = 0; i < map->num_stripes; i++) {
if (devid && map->stripes[i].dev->devid != devid)
continue;
if (map->stripes[i].physical > physical ||
map->stripes[i].physical + length <= physical)
continue;
stripe_nr = physical - map->stripes[i].physical;
stripe_nr = div_u64(stripe_nr, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
stripe_nr = div_u64(stripe_nr, map->sub_stripes);
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
} /* else if RAID[56], multiply by nr_data_stripes().
* Alternatively, just use rmap_len below instead of
* map->stripe_len */
bytenr = chunk_start + stripe_nr * rmap_len;
WARN_ON(nr >= map->num_stripes);
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr)
break;
}
if (j == nr) {
WARN_ON(nr >= map->num_stripes);
buf[nr++] = bytenr;
}
}
*logical = buf;
*naddrs = nr;
*stripe_len = rmap_len;
free_extent_map(em);
return 0;
}
static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
{
if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
bio_endio_nodec(bio, err);
else
bio_endio(bio, err);
btrfs_put_bbio(bbio);
}
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
if (err) {
atomic_inc(&bbio->error);
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
if (dev->bdev) {
if (bio->bi_rw & WRITE)
btrfs_dev_stat_inc(dev,
BTRFS_DEV_STAT_WRITE_ERRS);
else
btrfs_dev_stat_inc(dev,
BTRFS_DEV_STAT_READ_ERRS);
if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
btrfs_dev_stat_inc(dev,
BTRFS_DEV_STAT_FLUSH_ERRS);
btrfs_dev_stat_print_on_error(dev);
}
}
}
if (bio == bbio->orig_bio)
is_orig_bio = 1;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
btrfs_bio_counter_dec(bbio->fs_info);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
bio = bbio->orig_bio;
}
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
/* only send an error to the higher layers if it is
* beyond the tolerance of the btrfs bio
*/
if (atomic_read(&bbio->error) > bbio->max_errors) {
err = -EIO;
} else {
/*
* this bio is actually up to date, we didn't
* go over the max number of errors
*/
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
btrfs_end_bbio(bbio, bio, err);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
/*
* see run_scheduled_bios for a description of why bios are collected for
* async submit.
*
* This will add one bio to the pending list for a device and make sure
* the work struct is scheduled.
*/
static noinline void btrfs_schedule_bio(struct btrfs_root *root,
struct btrfs_device *device,
int rw, struct bio *bio)
{
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
if (device->missing || !device->bdev) {
bio_endio(bio, -EIO);
return;
}
/* don't bother with additional async steps for reads, right now */
if (!(rw & REQ_WRITE)) {
bio_get(bio);
btrfsic_submit_bio(rw, bio);
bio_put(bio);
return;
}
/*
* nr_async_bios allows us to reliably return congestion to the
* higher layers. Otherwise, the async bio makes it appear we have
* made progress against dirty pages when we've really just put it
* on a queue for later
*/
atomic_inc(&root->fs_info->nr_async_bios);
WARN_ON(bio->bi_next);
bio->bi_next = NULL;
bio->bi_rw |= rw;
spin_lock(&device->io_lock);
if (bio->bi_rw & REQ_SYNC)
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
if (pending_bios->tail)
pending_bios->tail->bi_next = bio;
pending_bios->tail = bio;
if (!pending_bios->head)
pending_bios->head = bio;
if (device->running_pending)
should_queue = 0;
spin_unlock(&device->io_lock);
if (should_queue)
btrfs_queue_work(root->fs_info->submit_workers,
&device->work);
}
static int bio_size_ok(struct block_device *bdev, struct bio *bio,
sector_t sector)
{
struct bio_vec *prev;
struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_sectors = queue_max_sectors(q);
struct bvec_merge_data bvm = {
.bi_bdev = bdev,
.bi_sector = sector,
.bi_rw = bio->bi_rw,
};
if (WARN_ON(bio->bi_vcnt == 0))
return 1;
prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (bio_sectors(bio) > max_sectors)
return 0;
if (!q->merge_bvec_fn)
return 1;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
return 0;
return 1;
}
static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
struct bio *bio, u64 physical, int dev_nr,
int rw, int async)
{
struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
bio->bi_private = bbio;
btrfs_io_bio(bio)->stripe_index = dev_nr;
bio->bi_end_io = btrfs_end_bio;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
bio->bi_iter.bi_sector = physical >> 9;
#ifdef DEBUG
{
struct rcu_string *name;
rcu_read_lock();
name = rcu_dereference(dev->name);
pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
"(%s id %llu), size=%u\n", rw,
(u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
name->str, dev->devid, bio->bi_iter.bi_size);
rcu_read_unlock();
}
#endif
bio->bi_bdev = dev->bdev;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
btrfs_bio_counter_inc_noblocked(root->fs_info);
if (async)
btrfs_schedule_bio(root, dev, rw, bio);
else
btrfsic_submit_bio(rw, bio);
}
static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
struct bio *first_bio, struct btrfs_device *dev,
int dev_nr, int rw, int async)
{
struct bio_vec *bvec = first_bio->bi_io_vec;
struct bio *bio;
int nr_vecs = bio_get_nr_vecs(dev->bdev);
u64 physical = bbio->stripes[dev_nr].physical;
again:
bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
if (!bio)
return -ENOMEM;
while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
bvec->bv_offset) < bvec->bv_len) {
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
u64 len = bio->bi_iter.bi_size;
atomic_inc(&bbio->stripes_pending);
submit_stripe_bio(root, bbio, bio, physical, dev_nr,
rw, async);
physical += len;
goto again;
}
bvec++;
}
submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
return 0;
}
static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
/* Shoud be the original bio. */
WARN_ON(bio != bbio->orig_bio);
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
bio->bi_iter.bi_sector = logical >> 9;
btrfs_end_bbio(bbio, bio, -EIO);
}
}
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
int mirror_num, int async_submit)
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
int dev_nr;
int total_devs;
struct btrfs_bio *bbio = NULL;
block: Abstract out bvec iterator Immutable biovecs are going to require an explicit iterator. To implement immutable bvecs, a later patch is going to add a bi_bvec_done member to this struct; for now, this patch effectively just renames things. Signed-off-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: "Ed L. Cashin" <ecashin@coraid.com> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Lars Ellenberg <drbd-dev@lists.linbit.com> Cc: Jiri Kosina <jkosina@suse.cz> Cc: Matthew Wilcox <willy@linux.intel.com> Cc: Geoff Levand <geoff@infradead.org> Cc: Yehuda Sadeh <yehuda@inktank.com> Cc: Sage Weil <sage@inktank.com> Cc: Alex Elder <elder@inktank.com> Cc: ceph-devel@vger.kernel.org Cc: Joshua Morris <josh.h.morris@us.ibm.com> Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Neil Brown <neilb@suse.de> Cc: Alasdair Kergon <agk@redhat.com> Cc: Mike Snitzer <snitzer@redhat.com> Cc: dm-devel@redhat.com Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: linux390@de.ibm.com Cc: Boaz Harrosh <bharrosh@panasas.com> Cc: Benny Halevy <bhalevy@tonian.com> Cc: "James E.J. Bottomley" <JBottomley@parallels.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Chris Mason <chris.mason@fusionio.com> Cc: "Theodore Ts'o" <tytso@mit.edu> Cc: Andreas Dilger <adilger.kernel@dilger.ca> Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Dave Kleikamp <shaggy@kernel.org> Cc: Joern Engel <joern@logfs.org> Cc: Prasad Joshi <prasadjoshi.linux@gmail.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp> Cc: Mark Fasheh <mfasheh@suse.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Ben Myers <bpm@sgi.com> Cc: xfs@oss.sgi.com Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Len Brown <len.brown@intel.com> Cc: Pavel Machek <pavel@ucw.cz> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com> Cc: Ben Hutchings <ben@decadent.org.uk> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Guo Chao <yan@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Asai Thambi S P <asamymuthupa@micron.com> Cc: Selvan Mani <smani@micron.com> Cc: Sam Bradshaw <sbradshaw@micron.com> Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: "Roger Pau Monné" <roger.pau@citrix.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com> Cc: Ian Campbell <Ian.Campbell@citrix.com> Cc: Sebastian Ott <sebott@linux.vnet.ibm.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Jerome Marchand <jmarchand@redhat.com> Cc: Joe Perches <joe@perches.com> Cc: Peng Tao <tao.peng@emc.com> Cc: Andy Adamson <andros@netapp.com> Cc: fanchaoting <fanchaoting@cn.fujitsu.com> Cc: Jie Liu <jeff.liu@oracle.com> Cc: Sunil Mushran <sunil.mushran@gmail.com> Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Namjae Jeon <namjae.jeon@samsung.com> Cc: Pankaj Kumar <pankaj.km@samsung.com> Cc: Dan Magenheimer <dan.magenheimer@oracle.com> Cc: Mel Gorman <mgorman@suse.de>6
2013-10-11 22:44:27 +00:00
length = bio->bi_iter.bi_size;
map_length = length;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
mirror_num, 1);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
if (ret) {
btrfs_bio_counter_dec(root->fs_info);
return ret;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
}
total_devs = bbio->num_stripes;
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if (bbio->raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
ret = raid56_parity_write(root, bio, bbio, map_length);
} else {
ret = raid56_parity_recover(root, bio, bbio, map_length,
mirror_num, 1);
}
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
btrfs_bio_counter_dec(root->fs_info);
return ret;
}
if (map_length < length) {
btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
logical, length, map_length);
BUG();
}
for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
dev = bbio->stripes[dev_nr].dev;
if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
bbio_error(bbio, first_bio, logical);
continue;
}
/*
* Check and see if we're ok with this bio based on it's size
* and offset with the given device.
*/
if (!bio_size_ok(dev->bdev, first_bio,
bbio->stripes[dev_nr].physical >> 9)) {
ret = breakup_stripe_bio(root, bbio, first_bio, dev,
dev_nr, rw, async_submit);
BUG_ON(ret);
continue;
}
if (dev_nr < total_devs - 1) {
bio = btrfs_bio_clone(first_bio, GFP_NOFS);
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
}
submit_stripe_bio(root, bbio, bio,
bbio->stripes[dev_nr].physical, dev_nr, rw,
async_submit);
}
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
btrfs_bio_counter_dec(root->fs_info);
return 0;
}
struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
u8 *uuid, u8 *fsid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
cur_devices = fs_info->fs_devices;
while (cur_devices) {
if (!fsid ||
!memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
device = __find_device(&cur_devices->devices,
devid, uuid);
if (device)
return device;
}
cur_devices = cur_devices->seed;
}
return NULL;
}
static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
if (IS_ERR(device))
return NULL;
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
fs_devices->num_devices++;
device->missing = 1;
fs_devices->missing_devices++;
return device;
}
/**
* btrfs_alloc_device - allocate struct btrfs_device
* @fs_info: used only for generating a new devid, can be NULL if
* devid is provided (i.e. @devid != NULL).
* @devid: a pointer to devid for this device. If NULL a new devid
* is generated.
* @uuid: a pointer to UUID for this device. If NULL a new UUID
* is generated.
*
* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
* on error. Returned struct is not linked onto any lists and can be
* destroyed with kfree() right away.
*/
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid)
{
struct btrfs_device *dev;
u64 tmp;
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
dev = __alloc_device();
if (IS_ERR(dev))
return dev;
if (devid)
tmp = *devid;
else {
int ret;
ret = find_next_devid(fs_info, &tmp);
if (ret) {
kfree(dev);
return ERR_PTR(ret);
}
}
dev->devid = tmp;
if (uuid)
memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
else
generate_random_uuid(dev->uuid);
Btrfs: fix task hang under heavy compressed write This has been reported and discussed for a long time, and this hang occurs in both 3.15 and 3.16. Btrfs now migrates to use kernel workqueue, but it introduces this hang problem. Btrfs has a kind of work queued as an ordered way, which means that its ordered_func() must be processed in the way of FIFO, so it usually looks like -- normal_work_helper(arg) work = container_of(arg, struct btrfs_work, normal_work); work->func() <---- (we name it work X) for ordered_work in wq->ordered_list ordered_work->ordered_func() ordered_work->ordered_free() The hang is a rare case, first when we find free space, we get an uncached block group, then we go to read its free space cache inode for free space information, so it will file a readahead request btrfs_readpages() for page that is not in page cache __do_readpage() submit_extent_page() btrfs_submit_bio_hook() btrfs_bio_wq_end_io() submit_bio() end_workqueue_bio() <--(ret by the 1st endio) queue a work(named work Y) for the 2nd also the real endio() So the hang occurs when work Y's work_struct and work X's work_struct happens to share the same address. A bit more explanation, A,B,C -- struct btrfs_work arg -- struct work_struct kthread: worker_thread() pick up a work_struct from @worklist process_one_work(arg) worker->current_work = arg; <-- arg is A->normal_work worker->current_func(arg) normal_work_helper(arg) A = container_of(arg, struct btrfs_work, normal_work); A->func() A->ordered_func() A->ordered_free() <-- A gets freed B->ordered_func() submit_compressed_extents() find_free_extent() load_free_space_inode() ... <-- (the above readhead stack) end_workqueue_bio() btrfs_queue_work(work C) B->ordered_free() As if work A has a high priority in wq->ordered_list and there are more ordered works queued after it, such as B->ordered_func(), its memory could have been freed before normal_work_helper() returns, which means that kernel workqueue code worker_thread() still has worker->current_work pointer to be work A->normal_work's, ie. arg's address. Meanwhile, work C is allocated after work A is freed, work C->normal_work and work A->normal_work are likely to share the same address(I confirmed this with ftrace output, so I'm not just guessing, it's rare though). When another kthread picks up work C->normal_work to process, and finds our kthread is processing it(see find_worker_executing_work()), it'll think work C as a collision and skip then, which ends up nobody processing work C. So the situation is that our kthread is waiting forever on work C. Besides, there're other cases that can lead to deadlock, but the real problem is that all btrfs workqueue shares one work->func, -- normal_work_helper, so this makes each workqueue to have its own helper function, but only a wraper pf normal_work_helper. With this patch, I no long hit the above hang. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 15:36:53 +00:00
btrfs_init_work(&dev->work, btrfs_submit_helper,
pending_bios_fn, NULL, NULL);
return dev;
}
static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
struct map_lookup *map;
struct extent_map *em;
u64 logical;
u64 length;
u64 devid;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
int ret;
int i;
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
read_unlock(&map_tree->map_tree.lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) {
free_extent_map(em);
return 0;
} else if (em) {
free_extent_map(em);
}
em = alloc_extent_map();
if (!em)
return -ENOMEM;
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
free_extent_map(em);
return -ENOMEM;
}
Btrfs: fix NULL pointer crash when running balance and scrub concurrently While running balance, scrub, fsstress concurrently we hit the following kernel crash: [56561.448845] BTRFS info (device sde): relocating block group 11005853696 flags 132 [56561.524077] BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 [56561.524237] IP: [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.524297] PGD 9be28067 PUD 7f3dd067 PMD 0 [56561.524325] Oops: 0000 [#1] SMP [....] [56561.527237] Call Trace: [56561.527309] [<ffffffffa038980e>] scrub_enumerate_chunks+0x24e/0x490 [btrfs] [56561.527392] [<ffffffff810abe00>] ? abort_exclusive_wait+0x50/0xb0 [56561.527476] [<ffffffffa038add4>] btrfs_scrub_dev+0x1a4/0x530 [btrfs] [56561.527561] [<ffffffffa0368107>] btrfs_ioctl+0x13f7/0x2a90 [btrfs] [56561.527639] [<ffffffff811c82f0>] do_vfs_ioctl+0x2e0/0x4c0 [56561.527712] [<ffffffff8109c384>] ? vtime_account_user+0x54/0x60 [56561.527788] [<ffffffff810f768c>] ? __audit_syscall_entry+0x9c/0xf0 [56561.527870] [<ffffffff811c8551>] SyS_ioctl+0x81/0xa0 [56561.527941] [<ffffffff815707f7>] tracesys+0xdd/0xe2 [...] [56561.528304] RIP [<ffffffffa038956d>] scrub_chunk.isra.12+0xdd/0x130 [btrfs] [56561.528395] RSP <ffff88004c0f5be8> [56561.528454] CR2: 0000000000000078 This is because in btrfs_relocate_chunk(), we will free @bdev directly while scrub may still hold extent mapping, and may access freed memory. Fix this problem by wrapping freeing @bdev work into free_extent_map() which is based on reference count. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 02:42:52 +00:00
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
em->orig_start = 0;
em->block_start = 0;
Btrfs: Add zlib compression support This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths. Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts later. * While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf. * Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages. * All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression. From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption or the 'other' field are currently used. In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software only limit, the disk format supports u64 sized compressed extents. In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software only limit and will be subject to tuning later. Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. This way additional encodings can be layered on without having to figure out which encoding to checksum. Compression happens at delalloc time, which is basically singled threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time. Decompression is hooked into readpages and it does spread across CPUs nicely. Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 18:49:59 +00:00
em->block_len = em->len;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = btrfs_chunk_type(leaf, chunk);
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
devid = btrfs_stripe_devid_nr(leaf, chunk, i);
read_extent_buffer(leaf, uuid, (unsigned long)
btrfs_stripe_dev_uuid_nr(chunk, i),
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
free_extent_map(em);
return -EIO;
}
if (!map->stripes[i].dev) {
map->stripes[i].dev =
add_missing_dev(root, root->fs_info->fs_devices,
devid, uuid);
if (!map->stripes[i].dev) {
free_extent_map(em);
return -EIO;
}
}
map->stripes[i].dev->in_fs_metadata = 1;
}
write_lock(&map_tree->map_tree.lock);
2013-04-05 20:51:15 +00:00
ret = add_extent_mapping(&map_tree->map_tree, em, 0);
write_unlock(&map_tree->map_tree.lock);
BUG_ON(ret); /* Tree corruption */
free_extent_map(em);
return 0;
}
static void fill_device_from_item(struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item,
struct btrfs_device *device)
{
unsigned long ptr;
device->devid = btrfs_device_id(leaf, dev_item);
device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
device->total_bytes = device->disk_total_bytes;
device->commit_total_bytes = device->disk_total_bytes;
device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
device->commit_bytes_used = device->bytes_used;
device->type = btrfs_device_type(leaf, dev_item);
device->io_align = btrfs_device_io_align(leaf, dev_item);
device->io_width = btrfs_device_io_width(leaf, dev_item);
device->sector_size = btrfs_device_sector_size(leaf, dev_item);
WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
device->is_tgtdev_for_dev_replace = 0;
ptr = btrfs_device_uuid(dev_item);
read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
}
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
u8 *fsid)
{
struct btrfs_fs_devices *fs_devices;
int ret;
BUG_ON(!mutex_is_locked(&uuid_mutex));
fs_devices = root->fs_info->fs_devices->seed;
while (fs_devices) {
if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
return fs_devices;
fs_devices = fs_devices->seed;
}
fs_devices = find_fsid(fsid);
if (!fs_devices) {
if (!btrfs_test_opt(root, DEGRADED))
return ERR_PTR(-ENOENT);
fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
return fs_devices;
fs_devices->seeding = 1;
fs_devices->opened = 1;
return fs_devices;
}
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
ret = __btrfs_open_devices(fs_devices, FMODE_READ,
root->fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(ret);
goto out;
}
if (!fs_devices->seeding) {
__btrfs_close_devices(fs_devices);
free_fs_devices(fs_devices);
fs_devices = ERR_PTR(-EINVAL);
goto out;
}
fs_devices->seed = root->fs_info->fs_devices->seed;
root->fs_info->fs_devices->seed = fs_devices;
out:
return fs_devices;
}
static int read_one_dev(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_dev_item *dev_item)
{
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
u64 devid;
int ret;
u8 fs_uuid[BTRFS_UUID_SIZE];
u8 dev_uuid[BTRFS_UUID_SIZE];
devid = btrfs_device_id(leaf, dev_item);
read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
fs_devices = open_seed_devices(root, fs_uuid);
if (IS_ERR(fs_devices))
return PTR_ERR(fs_devices);
}
device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
if (!device) {
if (!btrfs_test_opt(root, DEGRADED))
return -EIO;
btrfs_warn(root->fs_info, "devid %llu missing", devid);
device = add_missing_dev(root, fs_devices, devid, dev_uuid);
if (!device)
return -ENOMEM;
} else {
if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
return -EIO;
if(!device->bdev && !device->missing) {
/*
* this happens when a device that was properly setup
* in the device info lists suddenly goes bad.
* device->bdev is NULL, and so we have to set
* device->missing to one here
*/
device->fs_devices->missing_devices++;
device->missing = 1;
}
/* Move the device to its own fs_devices */
if (device->fs_devices != fs_devices) {
ASSERT(device->missing);
list_move(&device->dev_list, &fs_devices->devices);
device->fs_devices->num_devices--;
fs_devices->num_devices++;
device->fs_devices->missing_devices--;
fs_devices->missing_devices++;
device->fs_devices = fs_devices;
}
}
if (device->fs_devices != root->fs_info->fs_devices) {
BUG_ON(device->writeable);
if (device->generation !=
btrfs_device_generation(leaf, dev_item))
return -EINVAL;
}
fill_device_from_item(leaf, dev_item, device);
device->in_fs_metadata = 1;
if (device->writeable && !device->is_tgtdev_for_dev_replace) {
device->fs_devices->total_rw_bytes += device->total_bytes;
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += device->total_bytes -
device->bytes_used;
spin_unlock(&root->fs_info->free_chunk_lock);
}
ret = 0;
return ret;
}
int btrfs_read_sys_array(struct btrfs_root *root)
{
struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
u8 *array_ptr;
unsigned long sb_array_offset;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
u32 cur_offset;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
/*
* This will create extent buffer of nodesize, superblock size is
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
*/
sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artifical and just used to read the system array.
* btrfs_set_buffer_uptodate() call does not properly mark all it's
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
* the page's extents up-to-date (the hole beyond sb),
* write_extent_buffer then triggers a WARN_ON.
*
* Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
* but sb spans only this function. Add an explicit SetPageUptodate call
* to silence the warning eg. on PowerPC 64.
*/
if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
array_ptr = super_copy->sys_chunk_array;
sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
cur_offset = 0;
while (cur_offset < array_size) {
disk_key = (struct btrfs_disk_key *)array_ptr;
len = sizeof(*disk_key);
if (cur_offset + len > array_size)
goto out_short_read;
btrfs_disk_key_to_cpu(&key, disk_key);
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
chunk = (struct btrfs_chunk *)sb_array_offset;
/*
* At least one btrfs_chunk with one stripe must be
* present, exact stripe count check comes afterwards
*/
len = btrfs_chunk_item_size(1);
if (cur_offset + len > array_size)
goto out_short_read;
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
ret = read_one_chunk(root, &key, sb, chunk);
if (ret)
break;
} else {
ret = -EIO;
break;
}
array_ptr += len;
sb_array_offset += len;
cur_offset += len;
}
free_extent_buffer(sb);
return ret;
out_short_read:
printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
len, cur_offset);
free_extent_buffer(sb);
return -EIO;
}
int btrfs_read_chunk_tree(struct btrfs_root *root)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
int slot;
root = root->fs_info->chunk_root;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
mutex_lock(&uuid_mutex);
lock_chunks(root);
/*
* Read all device items, and then all the chunk items. All
* device items are found before any chunk item (their object id
* is smaller than the lowest possible object id for a chunk
* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto error;
while (1) {
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto error;
break;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
struct btrfs_dev_item);
ret = read_one_dev(root, leaf, dev_item);
if (ret)
goto error;
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
ret = read_one_chunk(root, &found_key, leaf, chunk);
if (ret)
goto error;
}
path->slots[0]++;
}
ret = 0;
error:
unlock_chunks(root);
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);
return ret;
}
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
while (fs_devices) {
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list)
device->dev_root = fs_info->dev_root;
mutex_unlock(&fs_devices->device_list_mutex);
fs_devices = fs_devices->seed;
}
}
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
{
int i;
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_dev_stat_reset(dev, i);
}
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct extent_buffer *eb;
int slot;
int ret = 0;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
int i;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
int item_size;
struct btrfs_dev_stats_item *ptr;
key.objectid = 0;
key.type = BTRFS_DEV_STATS_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
__btrfs_reset_dev_stats(device);
device->dev_stats_valid = 1;
btrfs_release_path(path);
continue;
}
slot = path->slots[0];
eb = path->nodes[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
item_size = btrfs_item_size_nr(eb, slot);
ptr = btrfs_item_ptr(eb, slot,
struct btrfs_dev_stats_item);
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
if (item_size >= (1 + i) * sizeof(__le64))
btrfs_dev_stat_set(device, i,
btrfs_dev_stats_value(eb, ptr, i));
else
btrfs_dev_stat_reset(device, i);
}
device->dev_stats_valid = 1;
btrfs_dev_stat_print_on_load(device);
btrfs_release_path(path);
}
mutex_unlock(&fs_devices->device_list_mutex);
out:
btrfs_free_path(path);
return ret < 0 ? ret : 0;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
struct btrfs_root *dev_root,
struct btrfs_device *device)
{
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_stats_item *ptr;
int ret;
int i;
key.objectid = 0;
key.type = BTRFS_DEV_STATS_KEY;
key.offset = device->devid;
path = btrfs_alloc_path();
BUG_ON(!path);
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
printk_in_rcu(KERN_WARNING "BTRFS: "
"error %d while searching for dev_stats item for device %s!\n",
ret, rcu_str_deref(device->name));
goto out;
}
if (ret == 0 &&
btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
printk_in_rcu(KERN_WARNING "BTRFS: "
"delete too small dev_stats item for device %s failed %d!\n",
rcu_str_deref(device->name), ret);
goto out;
}
ret = 1;
}
if (ret == 1) {
/* need to insert a new item */
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
printk_in_rcu(KERN_WARNING "BTRFS: "
"insert dev_stats item for device %s failed %d!\n",
rcu_str_deref(device->name), ret);
goto out;
}
}
eb = path->nodes[0];
ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
btrfs_set_dev_stats_value(eb, ptr, i,
btrfs_dev_stat_read(device, i));
btrfs_mark_buffer_dirty(eb);
out:
btrfs_free_path(path);
return ret;
}
/*
* called from commit_transaction. Writes all changed device stats to disk.
*/
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
int stats_cnt;
int ret = 0;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
continue;
stats_cnt = atomic_read(&device->dev_stats_ccnt);
ret = update_dev_stat_item(trans, dev_root, device);
if (!ret)
atomic_sub(stats_cnt, &device->dev_stats_ccnt);
}
mutex_unlock(&fs_devices->device_list_mutex);
return ret;
}
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
btrfs_dev_stat_inc(dev, index);
btrfs_dev_stat_print_on_error(dev);
}
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
int i;
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (btrfs_dev_stat_read(dev, i) != 0)
break;
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
printk_in_rcu(KERN_INFO "BTRFS: "
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
int btrfs_get_dev_stats(struct btrfs_root *root,
struct btrfs_ioctl_get_dev_stats *stats)
{
struct btrfs_device *dev;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
int i;
mutex_lock(&fs_devices->device_list_mutex);
dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
return -ENODEV;
} else if (!dev->dev_stats_valid) {
btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
return -ENODEV;
} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
if (stats->nr_items > i)
stats->values[i] =
btrfs_dev_stat_read_and_reset(dev, i);
else
btrfs_dev_stat_reset(dev, i);
}
} else {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (stats->nr_items > i)
stats->values[i] = btrfs_dev_stat_read(dev, i);
}
if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
return 0;
}
int btrfs_scratch_superblock(struct btrfs_device *device)
{
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
bh = btrfs_read_dev_super(device->bdev);
if (!bh)
return -EINVAL;
disk_super = (struct btrfs_super_block *)bh->b_data;
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
brelse(bh);
return 0;
}
/*
* Update the size of all devices, which is used for writing out the
* super blocks.
*/
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *curr, *next;
if (list_empty(&fs_devices->resized_devices))
return;
mutex_lock(&fs_devices->device_list_mutex);
lock_chunks(fs_info->dev_root);
list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
resized_list) {
list_del_init(&curr->resized_list);
curr->commit_total_bytes = curr->disk_total_bytes;
}
unlock_chunks(fs_info->dev_root);
mutex_unlock(&fs_devices->device_list_mutex);
}
/* Must be invoked during the transaction commit */
void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
struct btrfs_transaction *transaction)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_device *dev;
int i;
if (list_empty(&transaction->pending_chunks))
return;
/* In order to kick the device replace finish process */
lock_chunks(root);
list_for_each_entry(em, &transaction->pending_chunks, list) {
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
dev = map->stripes[i].dev;
dev->commit_bytes_used = dev->bytes_used;
}
}
unlock_chunks(root);
}
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
while (fs_devices) {
fs_devices->fs_info = fs_info;
fs_devices = fs_devices->seed;
}
}
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
while (fs_devices) {
fs_devices->fs_info = NULL;
fs_devices = fs_devices->seed;
}
}