Merge tag 'bcachefs-2024-10-31' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Various syzbot fixes, and the more notable ones:

   - Fix for pointers in an extent overflowing the max (16) on a
     filesystem with many devices: we were creating too many cached
     copies when moving data around. Now, we only create at most one
     cached copy if there's a promote target set.

     Caching will be a bit broken for reflinked data until 6.13: I have
      a larger series queued up that significantly improves the plumbing
      for data options down into the extent (bch_extent_rebalance) to
      fix this.

   - Fix for deadlock on -ENOSPC on tiny filesystems

     Allocation from the partial open_bucket list wasn't correctly
      accounting partial open_buckets as free: this fixes the main cause
      of timeouts in the automated tests"

* tag 'bcachefs-2024-10-31' of git://evilpiepirate.org/bcachefs:
  bcachefs: Fix NULL ptr dereference in btree_node_iter_and_journal_peek
  bcachefs: fix possible null-ptr-deref in __bch2_ec_stripe_head_get()
  bcachefs: Fix deadlock on -ENOSPC w.r.t. partial open buckets
  bcachefs: Don't filter partial list buckets in open_buckets_to_text()
  bcachefs: Don't keep tons of cached pointers around
  bcachefs: init freespace inited bits to 0 in bch2_fs_initialize
  bcachefs: Fix unhandled transaction restart in fallocate
  bcachefs: Fix UAF in bch2_reconstruct_alloc()
  bcachefs: fix null-ptr-deref in have_stripes()
  bcachefs: fix shift oob in alloc_lru_idx_fragmentation
  bcachefs: Fix invalid shift in validate_sb_layout()
Linus Torvalds 2024-11-01 07:21:03 -10:00
commit 7b83601da4
15 changed files with 161 additions and 39 deletions

fs/bcachefs/alloc_background.h

@@ -168,6 +168,9 @@ static inline bool data_type_movable(enum bch_data_type type)
 static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
                                               struct bch_dev *ca)
 {
+    if (a.data_type >= BCH_DATA_NR)
+        return 0;
+
     if (!data_type_movable(a.data_type) ||
         !bch2_bucket_sectors_fragmented(ca, a))
         return 0;
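
The guard matters because a.data_type is read from disk and can hold any value; shifting or indexing by it before a range check is undefined behaviour. Below is a minimal userspace sketch of the pattern (the enum, mask, and function names are illustrative stand-ins, not bcachefs code):

#include <stdint.h>
#include <stdio.h>

/* Illustrative types only: DT_* and the mask are stand-ins, not the
 * real bch_data_type enum. */
enum data_type { DT_FREE, DT_BTREE, DT_USER, DT_NR };

static int type_movable(unsigned int type)
{
	const uint32_t movable_mask = (1U << DT_BTREE) | (1U << DT_USER);

	if (type >= DT_NR)	/* untrusted on-disk value: reject first */
		return 0;
	return (movable_mask >> type) & 1;	/* shift is now in range */
}

int main(void)
{
	/* 200 would shift out of bounds without the guard above */
	printf("%d %d %d\n", type_movable(DT_USER), type_movable(DT_FREE),
	       type_movable(200));
	return 0;
}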

fs/bcachefs/alloc_foreground.c

@@ -162,6 +162,10 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
            ARRAY_SIZE(c->open_buckets_partial));

     spin_lock(&c->freelist_lock);
+    rcu_read_lock();
+    bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
+    rcu_read_unlock();
+
     ob->on_partial_list = true;
     c->open_buckets_partial[c->open_buckets_partial_nr++] =
         ob - c->open_buckets;
@@ -972,7 +976,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
             u64 avail;

             bch2_dev_usage_read_fast(ca, &usage);
-            avail = dev_buckets_free(ca, usage, watermark);
+            avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
             if (!avail)
                 continue;
@@ -981,6 +985,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
                        i);
             ob->on_partial_list = false;
+            rcu_read_lock();
+            bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+            rcu_read_unlock();
+
             ret = add_new_bucket(c, ptrs, devs_may_alloc,
                          nr_replicas, nr_effective,
                          have_cache, ob);
@@ -1191,7 +1199,13 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
             --c->open_buckets_partial_nr;
             swap(c->open_buckets_partial[i],
                  c->open_buckets_partial[c->open_buckets_partial_nr]);
+
             ob->on_partial_list = false;
+
+            rcu_read_lock();
+            bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+            rcu_read_unlock();
+
             spin_unlock(&c->freelist_lock);
             bch2_open_bucket_put(c, ob);
             spin_lock(&c->freelist_lock);
@@ -1610,8 +1624,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
          ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
          ob++) {
         spin_lock(&ob->lock);
-        if (ob->valid && !ob->on_partial_list &&
-            (!ca || ob->dev == ca->dev_idx))
+        if (ob->valid && (!ca || ob->dev == ca->dev_idx))
             bch2_open_bucket_to_text(out, c, ob);
         spin_unlock(&ob->lock);
     }
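
The one-line avail change above is the heart of the -ENOSPC deadlock fix: buckets parked on the partial open_bucket list are still allocatable, so they must count toward a device's available total. A toy before/after model (struct dev and both helpers are illustrative stand-ins, not the real bcachefs API):

#include <stdio.h>

/* Toy model: a tiny device with all of its free buckets parked on the
 * partial open_bucket list. */
struct dev {
	unsigned free_buckets;        /* on the freelist proper */
	unsigned nr_partial_buckets;  /* held on the partial list */
};

/* Before the fix: partial buckets were invisible here, so a tiny
 * filesystem could report 0 and the allocator would wait forever for
 * space that was already available. */
static unsigned avail_buggy(const struct dev *ca)
{
	return ca->free_buckets;
}

static unsigned avail_fixed(const struct dev *ca)
{
	return ca->free_buckets + ca->nr_partial_buckets;
}

int main(void)
{
	struct dev ca = { .free_buckets = 0, .nr_partial_buckets = 2 };

	printf("buggy: %u, fixed: %u\n", avail_buggy(&ca), avail_fixed(&ca));
	return 0;
}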

fs/bcachefs/bcachefs.h

@@ -555,6 +555,7 @@ struct bch_dev {
     u64         alloc_cursor[3];

     unsigned    nr_open_buckets;
+    unsigned    nr_partial_buckets;
     unsigned    nr_btree_reserve;

     size_t      inc_gen_needs_gc;

fs/bcachefs/btree_iter.c

@@ -882,6 +882,18 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
     __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);

     k = bch2_btree_and_journal_iter_peek(&jiter);
+    if (!k.k) {
+        struct printbuf buf = PRINTBUF;
+
+        prt_str(&buf, "node not found at pos ");
+        bch2_bpos_to_text(&buf, path->pos);
+        prt_str(&buf, " at btree ");
+        bch2_btree_pos_to_text(&buf, c, l->b);
+
+        ret = bch2_fs_topology_error(c, "%s", buf.buf);
+        printbuf_exit(&buf);
+        goto err;
+    }

     bch2_bkey_buf_reassemble(out, c, k);
@@ -889,6 +901,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
         c->opts.btree_node_prefetch)
         ret = btree_path_prefetch_j(trans, path, &jiter);

+err:
     bch2_btree_and_journal_iter_exit(&jiter);
     return ret;
 }
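
The new error path builds a description of the missing node and reports a topology error instead of letting the caller dereference a NULL key. A compact userspace sketch of the same build/report/exit pattern (this printbuf is a simplified stand-in for the kernel's, and the position string is hard-coded where bch2_bpos_to_text() would print):

#include <stdio.h>
#include <string.h>

/* Simplified printbuf: append strings into a fixed buffer, then hand
 * the assembled message to an error reporter the caller can act on. */
struct printbuf {
	char   buf[256];
	size_t pos;
};

static void prt_str(struct printbuf *out, const char *s)
{
	size_t room = sizeof(out->buf) - out->pos - 1;
	size_t len = strlen(s);

	if (len > room)
		len = room;
	memcpy(out->buf + out->pos, s, len);
	out->pos += len;
	out->buf[out->pos] = '\0';
}

/* Stand-in for bch2_fs_topology_error(): report and return an error
 * code instead of crashing on the missing node. */
static int report_topology_error(const char *msg)
{
	fprintf(stderr, "topology error: %s\n", msg);
	return -5;
}

int main(void)
{
	struct printbuf buf = { .pos = 0 };

	prt_str(&buf, "node not found at pos ");
	prt_str(&buf, "inode 42:1024"); /* where bch2_bpos_to_text() would print */

	report_topology_error(buf.buf);
	return 0;
}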

fs/bcachefs/data_update.c

@@ -236,7 +236,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
             if (((1U << i) & m->data_opts.rewrite_ptrs) &&
                 (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
                 !ptr->cached) {
-                bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+                bch2_extent_ptr_set_cached(c, &m->op.opts,
+                               bkey_i_to_s(insert), ptr);
                 rewrites_found |= 1U << i;
             }
             i++;
@@ -284,7 +285,8 @@ restart_drop_extra_replicas:
                 durability - ptr_durability >= m->op.opts.data_replicas) {
                 durability -= ptr_durability;

-                bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+                bch2_extent_ptr_set_cached(c, &m->op.opts,
+                               bkey_i_to_s(insert), &entry->ptr);
                 goto restart_drop_extra_replicas;
             }
         }
@@ -295,7 +297,7 @@ restart_drop_extra_replicas:
         bch2_extent_ptr_decoded_append(insert, &p);

     bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
-    bch2_extent_normalize(c, bkey_i_to_s(insert));
+    bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));

     ret = bch2_sum_sector_overwrites(trans, &iter, insert,
                      &should_check_enospc,
@@ -558,7 +560,8 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
 int bch2_extent_drop_ptrs(struct btree_trans *trans,
               struct btree_iter *iter,
               struct bkey_s_c k,
-              struct data_update_opts data_opts)
+              struct bch_io_opts *io_opts,
+              struct data_update_opts *data_opts)
 {
     struct bch_fs *c = trans->c;
     struct bkey_i *n;
@@ -569,11 +572,11 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
     if (ret)
         return ret;

-    while (data_opts.kill_ptrs) {
-        unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+    while (data_opts->kill_ptrs) {
+        unsigned i = 0, drop = __fls(data_opts->kill_ptrs);

         bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
-        data_opts.kill_ptrs ^= 1U << drop;
+        data_opts->kill_ptrs ^= 1U << drop;
     }

     /*
@@ -581,7 +584,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
      * will do the appropriate thing with it (turning it into a
      * KEY_TYPE_error key, or just a discard if it was a cached extent)
      */
-    bch2_extent_normalize(c, bkey_i_to_s(n));
+    bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));

     /*
      * Since we're not inserting through an extent iterator
@@ -720,7 +723,7 @@ int bch2_data_update_init(struct btree_trans *trans,
         m->data_opts.rewrite_ptrs = 0;
         /* if iter == NULL, it's just a promote */
         if (iter)
-            ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
+            ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
         goto out;
     }

fs/bcachefs/data_update.h

@@ -40,7 +40,8 @@ void bch2_data_update_read_done(struct data_update *,
 int bch2_extent_drop_ptrs(struct btree_trans *,
               struct btree_iter *,
               struct bkey_s_c,
-              struct data_update_opts);
+              struct bch_io_opts *,
+              struct data_update_opts *);

 void bch2_data_update_exit(struct data_update *);
 int bch2_data_update_init(struct btree_trans *, struct btree_iter *,

fs/bcachefs/ec.c

@@ -1870,6 +1870,10 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
     }

     h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
+    if (!h) {
+        h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
+        goto err;
+    }
 found:
     if (h->rw_devs_change_count != c->rw_devs_change_count)
         ec_stripe_head_devs_update(c, h);
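
ec_new_stripe_head_alloc() can return NULL on allocation failure, and the found: path previously dereferenced that NULL; the fix returns an error pointer instead. A userspace sketch of the ERR_PTR encoding the fix relies on (the stripe_head type and allocator are illustrative; only the pointer-encoding trick mirrors the kernel's):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace model of the kernel's ERR_PTR idiom: a small negative
 * errno is encoded in the pointer value itself, so one return value
 * carries either "success + pointer" or "failure + reason". */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long err)      { return (void *)err; }
static inline long  PTR_ERR(const void *p) { return (long)p; }
static inline int   IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct stripe_head { int redundancy; };

static struct stripe_head *stripe_head_alloc(int redundancy, int simulate_oom)
{
	struct stripe_head *h = simulate_oom ? NULL : malloc(sizeof(*h));

	if (!h)				/* the fix: report, don't return NULL */
		return ERR_PTR(-ENOMEM);
	h->redundancy = redundancy;
	return h;
}

int main(void)
{
	struct stripe_head *h = stripe_head_alloc(2, 1);

	if (IS_ERR(h)) {	/* caller checks IS_ERR, never derefs NULL */
		printf("alloc failed: %ld\n", PTR_ERR(h));
		return 0;
	}
	free(h);
	return 0;
}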

fs/bcachefs/errcode.h

@@ -83,6 +83,7 @@
     x(ENOMEM,            ENOMEM_fs_other_alloc)            \
     x(ENOMEM,            ENOMEM_dev_alloc)                 \
     x(ENOMEM,            ENOMEM_disk_accounting)           \
+    x(ENOMEM,            ENOMEM_stripe_head_alloc)         \
     x(ENOSPC,            ENOSPC_disk_reservation)          \
     x(ENOSPC,            ENOSPC_bucket_alloc)              \
     x(ENOSPC,            ENOSPC_disk_label_add)            \
@@ -222,6 +223,7 @@
     x(BCH_ERR_invalid_sb_layout,  invalid_sb_layout_type)                  \
     x(BCH_ERR_invalid_sb_layout,  invalid_sb_layout_nr_superblocks)        \
     x(BCH_ERR_invalid_sb_layout,  invalid_sb_layout_superblocks_overlap)   \
+    x(BCH_ERR_invalid_sb_layout,  invalid_sb_layout_sb_max_size_bits)      \
     x(BCH_ERR_invalid_sb,         invalid_sb_members_missing)              \
     x(BCH_ERR_invalid_sb,         invalid_sb_members)                      \
     x(BCH_ERR_invalid_sb,         invalid_sb_disk_groups)                  \

fs/bcachefs/extents.c

@@ -978,31 +978,54 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)
     return NULL;
 }

-void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
+                struct bch_extent_ptr *ptr)
+{
+    if (!opts->promote_target ||
+        !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
+        return false;
+
+    struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+
+    return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+}
+
+void bch2_extent_ptr_set_cached(struct bch_fs *c,
+                struct bch_io_opts *opts,
+                struct bkey_s k,
+                struct bch_extent_ptr *ptr)
 {
     struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
     union bch_extent_entry *entry;
-    union bch_extent_entry *ec = NULL;
+    struct extent_ptr_decoded p;

-    bkey_extent_entry_for_each(ptrs, entry) {
-        if (&entry->ptr == ptr) {
-            ptr->cached = true;
-            if (ec)
-                extent_entry_drop(k, ec);
-            return;
-        }
-
-        if (extent_entry_is_stripe_ptr(entry))
-            ec = entry;
-        else if (extent_entry_is_ptr(entry))
-            ec = NULL;
-    }
+    rcu_read_lock();
+    if (!want_cached_ptr(c, opts, ptr)) {
+        bch2_bkey_drop_ptr_noerror(k, ptr);
+        goto out;
+    }
+
+    /*
+     * Stripes can't contain cached data, for - reasons.
+     *
+     * Possibly something we can fix in the future?
+     */
+    bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+        if (&entry->ptr == ptr) {
+            if (p.has_ec)
+                bch2_bkey_drop_ptr_noerror(k, ptr);
+            else
+                ptr->cached = true;
+            goto out;
+        }

     BUG();
+out:
+    rcu_read_unlock();
 }

 /*
- * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
  *
  * Returns true if @k should be dropped entirely
  *
@@ -1016,8 +1039,39 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
     rcu_read_lock();
     bch2_bkey_drop_ptrs(k, ptr,
         ptr->cached &&
-        (ca = bch2_dev_rcu(c, ptr->dev)) &&
-        dev_ptr_stale_rcu(ca, ptr) > 0);
+        (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
+         dev_ptr_stale_rcu(ca, ptr) > 0));
     rcu_read_unlock();

     return bkey_deleted(k.k);
 }
+
+/*
+ * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
+ *
+ * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
+ * the promote target.
+ */
+bool bch2_extent_normalize_by_opts(struct bch_fs *c,
+                   struct bch_io_opts *opts,
+                   struct bkey_s k)
+{
+    struct bkey_ptrs ptrs;
+    bool have_cached_ptr;
+
+    rcu_read_lock();
+restart_drop_ptrs:
+    ptrs = bch2_bkey_ptrs(k);
+    have_cached_ptr = false;
+
+    bkey_for_each_ptr(ptrs, ptr)
+        if (ptr->cached) {
+            if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
+                bch2_bkey_drop_ptr(k, ptr);
+                goto restart_drop_ptrs;
+            }
+
+            have_cached_ptr = true;
+        }
+    rcu_read_unlock();
+
+    return bkey_deleted(k.k);
+}
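
This is the mechanism behind the first fix in the merge message: an extent now keeps at most one cached pointer, and only on the promote target, so pointer counts can no longer grow past the per-extent maximum on filesystems with many devices. A self-contained model of that policy (the types and the target check are deliberately simplified, not bcachefs's real key layout):

#include <stdbool.h>
#include <stdio.h>

/* Toy model: an extent's pointer list may keep at most ONE cached
 * pointer, and only if it lives on the promote target. */
struct ptr { int dev; bool cached; bool live; };

static bool on_promote_target(int promote_target, int dev)
{
	return promote_target && dev == promote_target;
}

static void normalize_cached_ptrs(struct ptr *ptrs, int n, int promote_target)
{
	bool have_cached = false;

	for (int i = 0; i < n; i++) {
		if (!ptrs[i].live || !ptrs[i].cached)
			continue;
		/* drop every cached copy beyond the first, and any copy
		 * that isn't on the promote target */
		if (have_cached || !on_promote_target(promote_target, ptrs[i].dev))
			ptrs[i].live = false;
		else
			have_cached = true;
	}
}

int main(void)
{
	struct ptr ptrs[] = {
		{ .dev = 1, .cached = false, .live = true }, /* durable copy */
		{ .dev = 2, .cached = true,  .live = true }, /* promote target */
		{ .dev = 3, .cached = true,  .live = true }, /* extra cached copy */
	};

	normalize_cached_ptrs(ptrs, 3, 2);
	for (int i = 0; i < 3; i++)
		printf("dev %d: %s\n", ptrs[i].dev,
		       ptrs[i].live ? "kept" : "dropped");
	return 0;
}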

fs/bcachefs/extents.h

@@ -686,9 +686,12 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
 struct bch_extent_ptr *
 bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);

-void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
+void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
+                struct bkey_s, struct bch_extent_ptr *);

+bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+
 void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
                 struct bkey_s_c);

fs/bcachefs/fs-io.c

@@ -587,7 +587,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
             POS(inode->v.i_ino, start_sector),
             BTREE_ITER_slots|BTREE_ITER_intent);

-    while (!ret && bkey_lt(iter.pos, end_pos)) {
+    while (!ret) {
         s64 i_sectors_delta = 0;
         struct quota_res quota_res = { 0 };
         struct bkey_s_c k;
@@ -598,6 +598,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
         bch2_trans_begin(trans);

+        if (bkey_ge(iter.pos, end_pos))
+            break;
+
         ret = bch2_subvolume_get_snapshot(trans,
                 inode->ei_inum.subvol, &snapshot);
         if (ret)
@@ -634,12 +637,15 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
             if (bch2_clamp_data_hole(&inode->v,
                          &hole_start,
                          &hole_end,
-                         opts.data_replicas, true))
+                         opts.data_replicas, true)) {
                 ret = drop_locks_do(trans,
                     (bch2_clamp_data_hole(&inode->v,
                                   &hole_start,
                                   &hole_end,
                                   opts.data_replicas, false), 0));
+                if (ret)
+                    goto bkey_err;
+            }
             bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));

             if (ret)
@@ -667,10 +673,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
         bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);

         if (bch2_mark_pagecache_reserved(inode, &hole_start,
-                         iter.pos.offset, true))
-            drop_locks_do(trans,
+                         iter.pos.offset, true)) {
+            ret = drop_locks_do(trans,
                 bch2_mark_pagecache_reserved(inode, &hole_start,
                              iter.pos.offset, false));
+            if (ret)
+                goto bkey_err;
+        }
 bkey_err:
         bch2_quota_reservation_put(c, inode, &quota_res);
         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
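
Both hunks above share one fix: drop_locks_do() can fail with a transaction-restart error, and its return value was previously discarded, leaving the restart unhandled. A toy model of the corrected control flow (the error constant and helpers are stand-ins, not the bcachefs API):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for BCH_ERR_transaction_restart. */
#define TRANSACTION_RESTART (-99)

/* Model of drop_locks_do(): dropping and retaking transaction locks
 * can itself fail with a restart that the caller must handle. */
static int drop_locks_do_model(bool must_restart)
{
	return must_restart ? TRANSACTION_RESTART : 0;
}

static int fallocate_iteration(bool must_restart)
{
	int ret = drop_locks_do_model(must_restart);

	if (ret)		/* the fix: propagate instead of discarding */
		goto bkey_err;

	/* ... the actual work would happen here, under valid locks ... */

bkey_err:
	if (ret == TRANSACTION_RESTART)
		printf("restart seen: unwind and retry this iteration\n");
	return ret;
}

int main(void)
{
	return fallocate_iteration(true) == TRANSACTION_RESTART ? 0 : 1;
}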

fs/bcachefs/move.c

@@ -266,7 +266,7 @@ int bch2_move_extent(struct moving_context *ctxt,
     if (!data_opts.rewrite_ptrs &&
         !data_opts.extra_replicas) {
         if (data_opts.kill_ptrs)
-            return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+            return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
         return 0;
     }

fs/bcachefs/recovery.c

@@ -94,11 +94,10 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
     __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
     c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-    bch2_write_super(c);
-    mutex_unlock(&c->sb_lock);

     c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));

+    bch2_write_super(c);
+    mutex_unlock(&c->sb_lock);
+
     bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
                      0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
@@ -1002,6 +1001,7 @@ int bch2_fs_initialize(struct bch_fs *c)
     struct bch_inode_unpacked root_inode, lostfound_inode;
     struct bkey_inode_buf packed_inode;
     struct qstr lostfound = QSTR("lost+found");
+    struct bch_member *m;
     int ret;

     bch_notice(c, "initializing new filesystem");
@@ -1018,6 +1018,14 @@ int bch2_fs_initialize(struct bch_fs *c)
         SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
         bch2_write_super(c);
     }
+
+    for_each_member_device(c, ca) {
+        m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+        SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
+        ca->mi = bch2_mi_to_cpu(m);
+    }
+
+    bch2_write_super(c);
     mutex_unlock(&c->sb_lock);

     c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;

fs/bcachefs/sb-downgrade.c

@@ -143,6 +143,9 @@ UPGRADE_TABLE()
 static int have_stripes(struct bch_fs *c)
 {
+    if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
+        return 0;
+
     return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
 }

fs/bcachefs/super-io.c

@@ -287,6 +287,11 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
         return -BCH_ERR_invalid_sb_layout_nr_superblocks;
     }

+    if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
+        prt_printf(out, "Invalid superblock layout: max_size_bits too high");
+        return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
+    }
+
     max_sectors = 1 << layout->sb_max_size_bits;

     prev_offset = le64_to_cpu(layout->sb_offset[0]);
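
sb_max_size_bits comes straight from the on-disk superblock, and `1 << bits` is undefined once bits reaches the operand's width, which is the invalid shift syzbot caught. A small standalone demonstration of the guard (SB_SIZE_BITS_MAX is an illustrative stand-in for BCH_SB_LAYOUT_SIZE_BITS_MAX):

#include <stdint.h>
#include <stdio.h>

/* Stand-in limit: the real bound is BCH_SB_LAYOUT_SIZE_BITS_MAX. */
#define SB_SIZE_BITS_MAX 16

static int64_t sb_max_sectors(uint8_t sb_max_size_bits)
{
	if (sb_max_size_bits > SB_SIZE_BITS_MAX)
		return -1;	/* reject the invalid layout up front */
	return (int64_t)1 << sb_max_size_bits;	/* shift is now in range */
}

int main(void)
{
	printf("%lld\n", (long long)sb_max_sectors(9));   /* 512 */
	printf("%lld\n", (long long)sb_max_sectors(200)); /* rejected: -1 */
	return 0;
}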